shithub: libvpx

ref: 47031c0a54b2054c98066919dbd1e71c60b3c4f4
parent: ef750d8472a9cd9cb7dbc9f0bb356e5da8d57e8d
author: Deb Mukherjee <debargha@google.com>
date: Fri May 16 14:52:01 EDT 2014

Updates libyuv to version 1005

Also adds a compile check and a libyuv configure flag

Change-Id: Ib9f0f4a71c4083e6f0aea7b5a5d175531ef0f66b

--- a/configure
+++ b/configure
@@ -52,6 +52,7 @@
   ${toggle_multi_res_encoding}    enable multiple-resolution encoding
   ${toggle_temporal_denoising}    enable temporal denoising and disable the spatial denoiser
   ${toggle_webm_io}               enable input from and output to WebM container
+  ${toggle_libyuv}                enable libyuv
 
 Codecs:
   Codecs can be selectively enabled or disabled individually, or by family:
@@ -315,6 +316,7 @@
     os_support
     unit_tests
     webm_io
+    libyuv
     decode_perf_tests
     multi_res_encoding
     temporal_denoising
@@ -368,6 +370,7 @@
     postproc_visualizer
     unit_tests
     webm_io
+    libyuv
     decode_perf_tests
     multi_res_encoding
     temporal_denoising
@@ -709,9 +712,11 @@
         *-vs*)
             soft_enable unit_tests
             soft_enable webm_io
+            soft_enable libyuv
         ;;
         *-android-*)
             soft_enable webm_io
+            soft_enable libyuv
             # GTestLog must be modified to use Android logging utilities.
         ;;
         *-darwin-*)
@@ -728,6 +733,9 @@
             check_cxx "$@" <<EOF && soft_enable webm_io
 int z;
 EOF
+            check_cxx "$@" <<EOF && soft_enable libyuv
+int z;
+EOF
         ;;
         *)
             enabled pthread_h && check_cxx "$@" <<EOF && soft_enable unit_tests
@@ -734,6 +742,9 @@
 int z;
 EOF
             check_cxx "$@" <<EOF && soft_enable webm_io
+int z;
+EOF
+            check_cxx "$@" <<EOF && soft_enable libyuv
 int z;
 EOF
         ;;
--- a/examples.mk
+++ b/examples.mk
@@ -10,10 +10,24 @@
 
 LIBYUV_SRCS +=  third_party/libyuv/include/libyuv/basic_types.h  \
                 third_party/libyuv/include/libyuv/cpu_id.h  \
+                third_party/libyuv/include/libyuv/planar_functions.h  \
+                third_party/libyuv/include/libyuv/row.h  \
                 third_party/libyuv/include/libyuv/scale.h  \
-                third_party/libyuv/source/row.h \
-                third_party/libyuv/source/scale.c  \
-                third_party/libyuv/source/cpu_id.c
+                third_party/libyuv/include/libyuv/scale_row.h  \
+                third_party/libyuv/source/cpu_id.cc \
+                third_party/libyuv/source/planar_functions.cc \
+                third_party/libyuv/source/row_any.cc \
+                third_party/libyuv/source/row_common.cc \
+                third_party/libyuv/source/row_mips.cc \
+                third_party/libyuv/source/row_neon.cc \
+                third_party/libyuv/source/row_posix.cc \
+                third_party/libyuv/source/row_win.cc \
+                third_party/libyuv/source/scale.cc  \
+                third_party/libyuv/source/scale_common.cc \
+                third_party/libyuv/source/scale_mips.cc \
+                third_party/libyuv/source/scale_neon.cc \
+                third_party/libyuv/source/scale_posix.cc \
+                third_party/libyuv/source/scale_win.cc
 
 LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
                       third_party/libwebm/mkvmuxerutil.cpp \
@@ -42,7 +56,9 @@
 vpxdec.SRCS                 += ivfdec.c ivfdec.h
 vpxdec.SRCS                 += tools_common.c tools_common.h
 vpxdec.SRCS                 += y4menc.c y4menc.h
-vpxdec.SRCS                 += $(LIBYUV_SRCS)
+ifeq ($(CONFIG_LIBYUV),yes)
+  vpxdec.SRCS                 += $(LIBYUV_SRCS)
+endif
 ifeq ($(CONFIG_WEBM_IO),yes)
   vpxdec.SRCS                 += $(LIBWEBM_PARSER_SRCS)
   vpxdec.SRCS                 += webmdec.cc webmdec.h
@@ -60,7 +76,9 @@
 vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
 vpxenc.SRCS                 += vpx_ports/vpx_timer.h
 vpxenc.SRCS                 += vpxstats.c vpxstats.h
-vpxenc.SRCS                 += $(LIBYUV_SRCS)
+ifeq ($(CONFIG_LIBYUV),yes)
+  vpxenc.SRCS                 += $(LIBYUV_SRCS)
+endif
 ifeq ($(CONFIG_WEBM_IO),yes)
   vpxenc.SRCS                 += $(LIBWEBM_MUXER_SRCS)
   vpxenc.SRCS                 += webmenc.cc webmenc.h
@@ -160,10 +178,12 @@
 
 
 ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
+ifeq ($(CONFIG_LIBYUV),yes)
 EXAMPLES-$(CONFIG_VP8_DECODER)          += vp8_multi_resolution_encoder.c
 vp8_multi_resolution_encoder.SRCS       += $(LIBYUV_SRCS)
 vp8_multi_resolution_encoder.GUID        = 04f8738e-63c8-423b-90fa-7c2703a374de
 vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
+endif
 endif
 
 # Handle extra library flags depending on codec configuration
--- a/third_party/libyuv/README.libvpx
+++ b/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 102
+Version: 1005
 License: BSD
 License File: LICENSE
 
@@ -13,5 +13,5 @@
 in order to encode multiple resolution bit streams.
 
 Local Modifications:
-Modified the original scaler code from C++ to C to fit in our current build
-system. This is a temporal solution, and will be improved later.
\ No newline at end of file
+Modified the original scaler code minimally with include file changes to fit
+in our current build system.
--- a/third_party/libyuv/include/libyuv/basic_types.h
+++ b/third_party/libyuv/include/libyuv/basic_types.h
@@ -1,22 +1,25 @@
 /*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_  // NOLINT
 #define INCLUDE_LIBYUV_BASIC_TYPES_H_
 
 #include <stddef.h>  // for NULL, size_t
 
-#if !(defined(_MSC_VER) && (_MSC_VER < 1600))
+#if defined(__ANDROID__) || (defined(_MSC_VER) && (_MSC_VER < 1600))
+#include <sys/types.h>  // for uintptr_t on x86
+#else
 #include <stdint.h>  // for uintptr_t
 #endif
 
+#ifndef GG_LONGLONG
 #ifndef INT_TYPES_DEFINED
 #define INT_TYPES_DEFINED
 #ifdef COMPILER_MSVC
@@ -30,9 +33,9 @@
 #endif
 #define INT64_F "I64"
 #else  // COMPILER_MSVC
-#ifdef __LP64__
-typedef unsigned long uint64;
-typedef long int64;
+#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long uint64;  // NOLINT
+typedef long int64;  // NOLINT
 #ifndef INT64_C
 #define INT64_C(x) x ## L
 #endif
@@ -40,9 +43,9 @@
 #define UINT64_C(x) x ## UL
 #endif
 #define INT64_F "l"
-#else  // __LP64__
-typedef unsigned long long uint64;
-typedef long long int64;
+#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
+typedef unsigned long long uint64;  // NOLINT
+typedef long long int64;  // NOLINT
 #ifndef INT64_C
 #define INT64_C(x) x ## LL
 #endif
@@ -54,11 +57,12 @@
 #endif  // COMPILER_MSVC
 typedef unsigned int uint32;
 typedef int int32;
-typedef unsigned short uint16;
-typedef short int16;
+typedef unsigned short uint16;  // NOLINT
+typedef short int16;  // NOLINT
 typedef unsigned char uint8;
-typedef char int8;
+typedef signed char int8;
 #endif  // INT_TYPES_DEFINED
+#endif  // GG_LONGLONG
 
 // Detect compiler is for x86 or x64.
 #if defined(__x86_64__) || defined(_M_X64) || \
@@ -65,9 +69,50 @@
     defined(__i386__) || defined(_M_IX86)
 #define CPU_X86 1
 #endif
+// Detect compiler is for ARM.
+#if defined(__arm__) || defined(_M_ARM)
+#define CPU_ARM 1
+#endif
 
+#ifndef ALIGNP
+#ifdef __cplusplus
 #define ALIGNP(p, t) \
-  ((uint8*)((((uintptr_t)(p) + \
-  ((t)-1)) & ~((t)-1))))
+    (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
+    ((t) - 1)) & ~((t) - 1))))
+#else
+#define ALIGNP(p, t) \
+    ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1))))  /* NOLINT */
+#endif
+#endif
 
-#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_
+#if !defined(LIBYUV_API)
+#if defined(_WIN32) || defined(__CYGWIN__)
+#if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllexport)
+#elif defined(LIBYUV_USING_SHARED_LIBRARY)
+#define LIBYUV_API __declspec(dllimport)
+#else
+#define LIBYUV_API
+#endif  // LIBYUV_BUILDING_SHARED_LIBRARY
+#elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
+    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
+    defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__ ((visibility ("default")))
+#else
+#define LIBYUV_API
+#endif  // __GNUC__
+#endif  // LIBYUV_API
+
+#define LIBYUV_BOOL int
+#define LIBYUV_FALSE 0
+#define LIBYUV_TRUE 1
+
+// Visual C x86 or GCC little endian.
+#if defined(__x86_64__) || defined(_M_X64) || \
+  defined(__i386__) || defined(_M_IX86) || \
+  defined(__arm__) || defined(_M_ARM) || \
+  (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define LIBYUV_LITTLE_ENDIAN
+#endif
+
+#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_  NOLINT
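
As a small illustration of the ALIGNP helper defined above, the sketch below over-allocates a buffer and rounds the pointer up to a 32-byte boundary. It is not part of this patch; the 256-byte size, the 32-byte alignment, and the assumption that the libyuv include directory is reachable at the path shown are all illustrative.

    /* Minimal sketch, not part of the patch: align a scratch buffer with ALIGNP. */
    #include <stdlib.h>
    #include "third_party/libyuv/include/libyuv/basic_types.h"  /* path is an assumption */

    int main(void) {
      uint8* raw = (uint8*)malloc(256 + 31);  /* over-allocate by alignment - 1 */
      uint8* aligned = ALIGNP(raw, 32);       /* round the pointer up to 32 bytes */
      /* ... use 'aligned' for SIMD-friendly reads and writes ... */
      free(raw);                              /* free the original, unaligned pointer */
      return 0;
    }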
--- a/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/third_party/libyuv/include/libyuv/cpu_id.h
@@ -1,49 +1,81 @@
 /*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CPU_ID_H_
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_  // NOLINT
 #define INCLUDE_LIBYUV_CPU_ID_H_
 
+#include "basic_types.h"
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
 #endif
 
-// These flags are only valid on x86 processors
-static const int kCpuHasSSE2 = 1;
-static const int kCpuHasSSSE3 = 2;
+// TODO(fbarchard): Consider overlapping bits for different architectures.
+// Internal flag to indicate cpuid requires initialization.
+#define kCpuInit 0x1
 
-// These flags are only valid on ARM processors
-static const int kCpuHasNEON = 4;
+// These flags are only valid on ARM processors.
+static const int kCpuHasARM = 0x2;
+static const int kCpuHasNEON = 0x4;
+// 0x8 reserved for future ARM flag.
 
-// Internal flag to indicate cpuid is initialized.
-static const int kCpuInitialized = 8;
+// These flags are only valid on x86 processors.
+static const int kCpuHasX86 = 0x10;
+static const int kCpuHasSSE2 = 0x20;
+static const int kCpuHasSSSE3 = 0x40;
+static const int kCpuHasSSE41 = 0x80;
+static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasAVX = 0x200;
+static const int kCpuHasAVX2 = 0x400;
+static const int kCpuHasERMS = 0x800;
+static const int kCpuHasFMA3 = 0x1000;
+// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
 
+// These flags are only valid on MIPS processors.
+static const int kCpuHasMIPS = 0x10000;
+static const int kCpuHasMIPS_DSP = 0x20000;
+static const int kCpuHasMIPS_DSPR2 = 0x40000;
+
+// Internal function used to auto-init.
+LIBYUV_API
+int InitCpuFlags(void);
+
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
+
 // Detect CPU has SSE2 etc.
-// test_flag parameter should be one of kCpuHas constants above
+// Test_flag parameter should be one of kCpuHas constants above.
 // returns non-zero if instruction set is detected
 static __inline int TestCpuFlag(int test_flag) {
-  extern int cpu_info_;
-  extern int InitCpuFlags();
-  return (cpu_info_ ? cpu_info_ : InitCpuFlags()) & test_flag;
+  LIBYUV_API extern int cpu_info_;
+  return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
 }
 
 // For testing, allow CPU flags to be disabled.
 // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
-// -1 to enable all cpu specific optimizations.
-// 0 to disable all cpu specific optimizations.
+// MaskCpuFlags(-1) to enable all cpu specific optimizations.
+// MaskCpuFlags(0) to disable all cpu specific optimizations.
+LIBYUV_API
 void MaskCpuFlags(int enable_flags);
 
+// Low level cpuid for X86. Returns zeros on other CPUs.
+// eax is the info type that you want.
+// ecx is typically the cpu number, and should normally be zero.
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CPU_ID_H_
+#endif  // INCLUDE_LIBYUV_CPU_ID_H_  NOLINT
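
To show how the expanded cpu_id flags above are meant to be consumed, here is a minimal sketch. It is not part of this patch; the include path and the particular flags chosen are only for illustration.

    /* Minimal sketch, not part of the patch: choose a row-function path at run time. */
    #include "third_party/libyuv/include/libyuv/cpu_id.h"  /* path is an assumption */

    static const char* PickRowPath(void) {
      if (TestCpuFlag(kCpuHasSSSE3)) return "SSSE3";  /* x86 with SSSE3 */
      if (TestCpuFlag(kCpuHasNEON))  return "NEON";   /* ARM with NEON */
      return "C";                                     /* portable fallback */
    }

    /* For testing, MaskCpuFlags(0) disables all cpu-specific paths and
       MaskCpuFlags(-1) re-enables them, as documented above. */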
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/planar_functions.h
@@ -1,0 +1,439 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  // NOLINT
+#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
+
+#include "basic_types.h"
+
+// TODO(fbarchard): Remove the following header includes.
+// #include "convert.h"
+// #include "convert_argb.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data.
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+                  uint16* dst_y, int dst_stride_y,
+                  int width, int height);
+
+// Set a plane of data to a 32 bit value.
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+              int width, int height,
+              uint32 value);
+
+// Copy I400.  Supports inverting.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+
+// Copy I422 to I422.
+#define I422ToI422 I422Copy
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
+// Copy I444 to I444.
+#define I444ToI444 I444Copy
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height);
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Convert I420 to I400. (calls CopyPlane ignoring u/v).
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alias
+#define I420ToI420Mirror I420Mirror
+
+// I420 mirror.
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height);
+
+// Alias
+#define I400ToI400Mirror I400Mirror
+
+// I400 mirror.  A single plane is mirrored horizontally.
+// Pass negative height to achieve 180 degree rotation.
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height);
+
+// Alias
+#define ARGBToARGBMirror ARGBMirror
+
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height);
+
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height);
+
+// I422ToARGB is in convert_argb.h
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height);
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height);
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height);
+
+// Draw a rectangle into I420.
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int x, int y, int width, int height,
+             int value_y, int value_u, int value_v);
+
+// Draw a rectangle into ARGB.
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+             int x, int y, int width, int height, uint32 value);
+
+// Convert ARGB to gray scale ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height);
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+             int x, int y, int width, int height);
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+              int x, int y, int width, int height);
+
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The next 4 coefficients apply to B, G, R, A and produce R of the output.
+// The last 4 coefficients apply to B, G, R, A and produce A of the output.
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int width, int height);
+
+// Deprecated. Use ARGBColorMatrix instead.
+// Apply a matrix rotation to each ARGB pixel.
+// matrix_argb is 3 signed ARGB values. -128 to 127 representing -1 to 1.
+// The first 4 coefficients apply to B, G, R, A and produce B of the output.
+// The next 4 coefficients apply to B, G, R, A and produce G of the output.
+// The last 4 coefficients apply to B, G, R, A and produce R of the output.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                   const uint8* table_argb,
+                   int x, int y, int width, int height);
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                  const uint8* table_argb,
+                  int x, int y, int width, int height);
+
+// Apply a luma/color table to each ARGB pixel but preserve destination alpha.
+// Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
+// RGB (YJ style) and C is an 8 bit color component (R, G or B).
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_argb, int dst_stride_argb,
+                       const uint8* luma_rgb_table,
+                       int width, int height);
+
+// Apply a 3 term polynomial to ARGB values.
+// poly points to a 4x4 matrix.  The first row is constants.  The 2nd row is
+// coefficients for b, g, r and a.  The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared.  The 4th row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3.  The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be derived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   const float* poly,
+                   int width, int height);
+
+// Quantize a rectangle of ARGB. Alpha unaffected.
+// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
+// interval_size should be a value between 1 and 255.
+// interval_offset should be a value between 0 and 255.
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+                 int scale, int interval_size, int interval_offset,
+                 int x, int y, int width, int height);
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int width, int height);
+
+// Copy the alpha channel of ARGB to the alpha channel of ARGB.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height);
+
+// Copy a Y plane into the alpha channel of ARGB.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+                     uint8* dst_argb, int dst_stride_argb,
+                     int width, int height);
+
+typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
+                             uint8* dst_argb, int width);
+
+// Get function to Alpha Blend ARGB pixels and store to destination.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend();
+
+// Alpha Blend ARGB images and store to destination.
+// Alpha of destination is set to 255.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+              const uint8* src_argb1, int src_stride_argb1,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
+// Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
+// Add ARGB image with ARGB image. Saturates to 255.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+            const uint8* src_argb1, int src_stride_argb1,
+            uint8* dst_argb, int dst_stride_argb,
+            int width, int height);
+
+// Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height);
+
+// Convert I422 to YUY2.
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+// Convert I422 to UYVY.
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_frame, int dst_stride_frame,
+               int width, int height);
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height);
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height);
+
+// Convert MJPG to ARGB.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+               uint8* argb, int argb_stride,
+               int w, int h, int dw, int dh);
+
+// Internal function - do not call directly.
+// Computes table of cumulative sum for image where the value is the sum
+// of all values above and to the left of the entry. Used by ARGBBlur.
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+                             int32* dst_cumsum, int dst_stride32_cumsum,
+                             int width, int height);
+
+// Blur ARGB image.
+// dst_cumsum table of width * (height + 1) * 16 bytes aligned to
+//   16 byte boundary.
+// dst_stride32_cumsum is number of ints in a row (width * 4).
+// radius is number of pixels around the center.  e.g. 1 = 3x3. 2=5x5.
+// Blur is optimized for radius of 5 (11x11) or less.
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int32* dst_cumsum, int dst_stride32_cumsum,
+             int width, int height, int radius);
+
+// Multiply ARGB image by ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height, uint32 value);
+
+// Interpolate between two ARGB images using specified amount of interpolation
+// (0 to 255) and store to destination.
+// 'interpolation' is specified as 8 bit fraction where 0 means 100% src_argb0
+// and 255 means 1% src_argb0 and 99% src_argb1.
+// Internally uses ARGBScale bilinear filtering.
+// Caveat: This function will write up to 16 bytes beyond the end of dst_argb.
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+                    const uint8* src_argb1, int src_stride_argb1,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int interpolation);
+
+#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+    defined(TARGET_IPHONE_SIMULATOR)
+#define LIBYUV_DISABLE_X86
+#endif
+
+// Row functions for copying pixels from a source with a slope to a row
+// of destination. Useful for scaling, rotation, mirror, texture mapping.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+                     uint8* dst_argb, const float* uv_dudv, int width);
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width);
+#define HAS_ARGBAFFINEROW_SSE2
+#endif  // LIBYUV_DISABLE_X86
+
+// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
+// shuffler is 16 bytes and must be aligned.
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height);
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+                     uint8* dst_y, int dst_stride_y,
+                     int width, int height);
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height);
+
+// Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  NOLINT
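
As a worked illustration of the coefficient layout documented for ARGBColorMatrix above, the sketch below builds an identity matrix (64 stands for 1.0, since -128..127 maps to -2..2) and forwards it to the function. It is not part of this patch; the wrapper, the buffers it expects, and the include path are illustrative.

    /* Minimal sketch, not part of the patch: identity color matrix for ARGBColorMatrix. */
    #include "third_party/libyuv/include/libyuv/planar_functions.h"  /* path is an assumption */

    /* Each row of 4 coefficients applies to B, G, R, A and produces one output
       channel; 64 ~= 1.0, so every channel is copied through unchanged. */
    static const int8 kIdentityARGB[16] = {
      64,  0,  0,  0,   /* B out */
       0, 64,  0,  0,   /* G out */
       0,  0, 64,  0,   /* R out */
       0,  0,  0, 64,   /* A out */
    };

    static int ApplyIdentity(const uint8* src_argb, int src_stride,
                             uint8* dst_argb, int dst_stride,
                             int width, int height) {
      return ARGBColorMatrix(src_argb, src_stride, dst_argb, dst_stride,
                             kIdentityARGB, width, height);
    }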
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/row.h
@@ -1,0 +1,1704 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_ROW_H_
+
+#include <stdlib.h>  // For malloc.
+
+#include "basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
+
+#ifdef __cplusplus
+#define align_buffer_64(var, size)                                             \
+  uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63));            \
+  uint8* var = reinterpret_cast<uint8*>                                        \
+      ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
+#else
+#define align_buffer_64(var, size)                                             \
+  uint8* var##_mem = (uint8*)(malloc((size) + 63));               /* NOLINT */ \
+  uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63)       /* NOLINT */
+#endif
+
+#define free_aligned_buffer_64(var) \
+  free(var##_mem);  \
+  var = 0
+
+#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+    defined(TARGET_IPHONE_SIMULATOR)
+#define LIBYUV_DISABLE_X86
+#endif
+// True if compiling for SSSE3 as a requirement.
+#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
+#define LIBYUV_SSSE3_ONLY
+#endif
+
+// Enable for NaCL pepper 33 for bundle and AVX2 support.
+//  #define NEW_BINUTILS
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Effects:
+#define HAS_ARGBADDROW_SSE2
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
+#define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBCOLORTABLEROW_X86
+#define HAS_ARGBCOPYALPHAROW_SSE2
+#define HAS_ARGBCOPYYTOALPHAROW_SSE2
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBLUMACOLORTABLEROW_SSSE3
+#define HAS_ARGBMIRRORROW_SSSE3
+#define HAS_ARGBMULTIPLYROW_SSE2
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+#define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADEROW_SSE2
+#define HAS_ARGBSUBTRACTROW_SSE2
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ARGBUNATTENUATEROW_SSE2
+#define HAS_COMPUTECUMULATIVESUMROW_SSE2
+#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+#define HAS_INTERPOLATEROW_SSE2
+#define HAS_INTERPOLATEROW_SSSE3
+#define HAS_RGBCOLORTABLEROW_X86
+#define HAS_SOBELROW_SSE2
+#define HAS_SOBELTOPLANEROW_SSE2
+#define HAS_SOBELXROW_SSE2
+#define HAS_SOBELXYROW_SSE2
+#define HAS_SOBELYROW_SSE2
+
+// Conversions:
+#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ABGRTOYROW_SSSE3
+#define HAS_ARGB1555TOARGBROW_SSE2
+#define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSE2
+#define HAS_ARGBSHUFFLEROW_SSSE3
+#define HAS_ARGBTOARGB1555ROW_SSE2
+#define HAS_ARGBTOARGB4444ROW_SSE2
+#define HAS_ARGBTOBAYERGGROW_SSE2
+#define HAS_ARGBTOBAYERROW_SSSE3
+#define HAS_ARGBTORAWROW_SSSE3
+#define HAS_ARGBTORGB24ROW_SSSE3
+#define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTOUV422ROW_SSSE3
+#define HAS_ARGBTOUV444ROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOYJROW_SSSE3
+#define HAS_ARGBTOYROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOYROW_SSSE3
+#define HAS_COPYROW_ERMS
+#define HAS_COPYROW_SSE2
+#define HAS_COPYROW_X86
+#define HAS_HALFROW_SSE2
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I411TOARGBROW_SSSE3
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOABGRROW_SSSE3
+#define HAS_I422TOARGB1555ROW_SSSE3
+#define HAS_I422TOARGB4444ROW_SSSE3
+#define HAS_I422TOARGBROW_SSSE3
+#define HAS_I422TOBGRAROW_SSSE3
+#define HAS_I422TORAWROW_SSSE3
+#define HAS_I422TORGB24ROW_SSSE3
+#define HAS_I422TORGB565ROW_SSSE3
+#define HAS_I422TORGBAROW_SSSE3
+#define HAS_I422TOUYVYROW_SSE2
+#define HAS_I422TOYUY2ROW_SSE2
+#define HAS_I444TOARGBROW_SSSE3
+#define HAS_MERGEUVROW_SSE2
+#define HAS_MIRRORROW_SSE2
+#define HAS_MIRRORROW_SSSE3
+#define HAS_MIRRORROW_UV_SSSE3
+#define HAS_MIRRORUVROW_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB565ROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
+#define HAS_NV21TORGB565ROW_SSSE3
+#define HAS_RAWTOARGBROW_SSSE3
+#define HAS_RAWTOYROW_SSSE3
+#define HAS_RGB24TOARGBROW_SSSE3
+#define HAS_RGB24TOYROW_SSSE3
+#define HAS_RGB565TOARGBROW_SSE2
+#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOYROW_SSSE3
+#define HAS_SETROW_X86
+#define HAS_SPLITUVROW_SSE2
+#define HAS_UYVYTOARGBROW_SSSE3
+#define HAS_UYVYTOUV422ROW_SSE2
+#define HAS_UYVYTOUVROW_SSE2
+#define HAS_UYVYTOYROW_SSE2
+#define HAS_YTOARGBROW_SSE2
+#define HAS_YUY2TOARGBROW_SSSE3
+#define HAS_YUY2TOUV422ROW_SSE2
+#define HAS_YUY2TOUVROW_SSE2
+#define HAS_YUY2TOYROW_SSE2
+#endif
+
+// GCC >= 4.7.0 required for AVX2.
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
+#define GCC_HAS_AVX2 1
+#endif  // GNUC >= 4.7
+#endif  // __GNUC__
+
+// clang >= 3.4.0 required for AVX2.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ > 3) || (__clang_major__ == 3 && (__clang_minor__ >= 4))
+#define CLANG_HAS_AVX2 1
+#endif  // clang >= 3.4
+#endif  // __clang__
+
+// Visual C 2012 required for AVX2.
+#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700
+#define VISUALC_HAS_AVX2 1
+#endif  // VisualStudio >= 2012
+
+// The following are available on all x86 platforms, but
+// require VS2012, clang 3.4 or gcc 4.7.
+// The code supports NaCL but requires a new compiler and validator.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
+    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+// Effects:
+#define HAS_ARGBPOLYNOMIALROW_AVX2
+#define HAS_ARGBSHUFFLEROW_AVX2
+#define HAS_ARGBCOPYALPHAROW_AVX2
+#define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#endif
+
+// The following require VS2012.
+// TODO(fbarchard): Port to gcc.
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_ARGBTOUVROW_AVX2
+#define HAS_ARGBTOYJROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_HALFROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
+#define HAS_INTERPOLATEROW_AVX2
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MIRRORROW_AVX2
+#define HAS_SPLITUVROW_AVX2
+#define HAS_UYVYTOUV422ROW_AVX2
+#define HAS_UYVYTOUVROW_AVX2
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_YUY2TOUV422ROW_AVX2
+#define HAS_YUY2TOUVROW_AVX2
+#define HAS_YUY2TOYROW_AVX2
+
+// Effects:
+#define HAS_ARGBADDROW_AVX2
+#define HAS_ARGBATTENUATEROW_AVX2
+#define HAS_ARGBMIRRORROW_AVX2
+#define HAS_ARGBMULTIPLYROW_AVX2
+#define HAS_ARGBSUBTRACTROW_AVX2
+#define HAS_ARGBUNATTENUATEROW_AVX2
+#endif  // defined(VISUALC_HAS_AVX2)
+
+// The following are Yasm x86 only:
+// TODO(fbarchard): Port AVX2 to inline.
+#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM) && \
+    (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__x86_64__) || defined(__i386__))
+#define HAS_MERGEUVROW_AVX2
+#define HAS_MERGEUVROW_MMX
+#define HAS_SPLITUVROW_AVX2
+#define HAS_SPLITUVROW_MMX
+#define HAS_UYVYTOYROW_AVX2
+#define HAS_UYVYTOYROW_MMX
+#define HAS_YUY2TOYROW_AVX2
+#define HAS_YUY2TOYROW_MMX
+#endif
+
+// The following are disabled when SSSE3 is available:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
+    !defined(LIBYUV_SSSE3_ONLY)
+#define HAS_ARGBBLENDROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSE2
+#define HAS_MIRRORROW_SSE2
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_ABGRTOUVROW_NEON
+#define HAS_ABGRTOYROW_NEON
+#define HAS_ARGB1555TOARGBROW_NEON
+#define HAS_ARGB1555TOUVROW_NEON
+#define HAS_ARGB1555TOYROW_NEON
+#define HAS_ARGB4444TOARGBROW_NEON
+#define HAS_ARGB4444TOUVROW_NEON
+#define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBTOARGB1555ROW_NEON
+#define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTOBAYERROW_NEON
+#define HAS_ARGBTOBAYERGGROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORGB565ROW_NEON
+#define HAS_ARGBTOUV411ROW_NEON
+#define HAS_ARGBTOUV422ROW_NEON
+#define HAS_ARGBTOUV444ROW_NEON
+#define HAS_ARGBTOUVROW_NEON
+#define HAS_ARGBTOUVJROW_NEON
+#define HAS_ARGBTOYROW_NEON
+#define HAS_ARGBTOYJROW_NEON
+#define HAS_BGRATOUVROW_NEON
+#define HAS_BGRATOYROW_NEON
+#define HAS_COPYROW_NEON
+#define HAS_HALFROW_NEON
+#define HAS_I400TOARGBROW_NEON
+#define HAS_I411TOARGBROW_NEON
+#define HAS_I422TOABGRROW_NEON
+#define HAS_I422TOARGB1555ROW_NEON
+#define HAS_I422TOARGB4444ROW_NEON
+#define HAS_I422TOARGBROW_NEON
+#define HAS_I422TOBGRAROW_NEON
+#define HAS_I422TORAWROW_NEON
+#define HAS_I422TORGB24ROW_NEON
+#define HAS_I422TORGB565ROW_NEON
+#define HAS_I422TORGBAROW_NEON
+#define HAS_I422TOUYVYROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I444TOARGBROW_NEON
+#define HAS_MERGEUVROW_NEON
+#define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
+#define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB565ROW_NEON
+#define HAS_NV21TOARGBROW_NEON
+#define HAS_NV21TORGB565ROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RAWTOUVROW_NEON
+#define HAS_RAWTOYROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RGB24TOUVROW_NEON
+#define HAS_RGB24TOYROW_NEON
+#define HAS_RGB565TOARGBROW_NEON
+#define HAS_RGB565TOUVROW_NEON
+#define HAS_RGB565TOYROW_NEON
+#define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYROW_NEON
+#define HAS_SETROW_NEON
+#define HAS_SPLITUVROW_NEON
+#define HAS_UYVYTOARGBROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YTOARGBROW_NEON
+#define HAS_YUY2TOARGBROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
+
+// Effects:
+#define HAS_ARGBADDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
+#define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBGRAYROW_NEON
+#define HAS_ARGBMIRRORROW_NEON
+#define HAS_ARGBMULTIPLYROW_NEON
+#define HAS_ARGBQUANTIZEROW_NEON
+#define HAS_ARGBSEPIAROW_NEON
+#define HAS_ARGBSHADEROW_NEON
+#define HAS_ARGBSUBTRACTROW_NEON
+#define HAS_SOBELROW_NEON
+#define HAS_SOBELTOPLANEROW_NEON
+#define HAS_SOBELXYROW_NEON
+#define HAS_SOBELXROW_NEON
+#define HAS_SOBELYROW_NEON
+#define HAS_INTERPOLATEROW_NEON
+// TODO(fbarchard): Investigate neon unittest failure.
+// #define HAS_ARGBCOLORMATRIXROW_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+#define HAS_COPYROW_MIPS
+#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_I422TOABGRROW_MIPS_DSPR2
+#define HAS_I422TOARGBROW_MIPS_DSPR2
+#define HAS_I422TOBGRAROW_MIPS_DSPR2
+#define HAS_INTERPOLATEROWS_MIPS_DSPR2
+#define HAS_MIRRORROW_MIPS_DSPR2
+#define HAS_MIRRORUVROW_MIPS_DSPR2
+#define HAS_SPLITUVROW_MIPS_DSPR2
+#endif
+#endif
+
+#if defined(_MSC_VER) && !defined(__CLR_VER)
+#define SIMD_ALIGNED(var) __declspec(align(16)) var
+typedef __declspec(align(16)) int16 vec16[8];
+typedef __declspec(align(16)) int32 vec32[4];
+typedef __declspec(align(16)) int8 vec8[16];
+typedef __declspec(align(16)) uint16 uvec16[8];
+typedef __declspec(align(16)) uint32 uvec32[4];
+typedef __declspec(align(16)) uint8 uvec8[16];
+typedef __declspec(align(32)) int16 lvec16[16];
+typedef __declspec(align(32)) int32 lvec32[8];
+typedef __declspec(align(32)) int8 lvec8[32];
+typedef __declspec(align(32)) uint16 ulvec16[16];
+typedef __declspec(align(32)) uint32 ulvec32[8];
+typedef __declspec(align(32)) uint8 ulvec8[32];
+
+#elif defined(__GNUC__)
+// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
+#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
+typedef int16 __attribute__((vector_size(16))) vec16;
+typedef int32 __attribute__((vector_size(16))) vec32;
+typedef int8 __attribute__((vector_size(16))) vec8;
+typedef uint16 __attribute__((vector_size(16))) uvec16;
+typedef uint32 __attribute__((vector_size(16))) uvec32;
+typedef uint8 __attribute__((vector_size(16))) uvec8;
+#else
+#define SIMD_ALIGNED(var) var
+typedef int16 vec16[8];
+typedef int32 vec32[4];
+typedef int8 vec8[16];
+typedef uint16 uvec16[8];
+typedef uint32 uvec32[4];
+typedef uint8 uvec8[16];
+#endif
+
+#if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
+#define OMITFP
+#else
+#define OMITFP __attribute__((optimize("omit-frame-pointer")))
+#endif
+
+// NaCL macros for GCC x86 and x64.
+
+// TODO(nfullagar): When pepper_33 toolchain is distributed, default to
+// NEW_BINUTILS and remove all BUNDLEALIGN occurrences.
+#if defined(__native_client__)
+#define LABELALIGN ".p2align 5\n"
+#else
+#define LABELALIGN ".p2align 2\n"
+#endif
+#if defined(__native_client__) && defined(__x86_64__)
+#if defined(NEW_BINUTILS)
+#define BUNDLELOCK ".bundle_lock\n"
+#define BUNDLEUNLOCK ".bundle_unlock\n"
+#define BUNDLEALIGN "\n"
+#else
+#define BUNDLELOCK "\n"
+#define BUNDLEUNLOCK "\n"
+#define BUNDLEALIGN ".p2align 5\n"
+#endif
+#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
+#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
+#define MEMLEA(offset, base) #offset "(%q" #base ")"
+#define MEMLEA3(offset, index, scale) \
+    #offset "(,%q" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+    #offset "(%q" #base ",%q" #index "," #scale ")"
+#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
+#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%%" #reg "\n" \
+    BUNDLEUNLOCK
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " %%" #reg ",(%%r15,%%r14)\n" \
+    BUNDLEUNLOCK
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+    BUNDLELOCK \
+    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
+    #opcode " (%%r15,%%r14),%" #arg "\n" \
+    BUNDLEUNLOCK
+#else
+#define BUNDLEALIGN "\n"
+#define MEMACCESS(base) "(%" #base ")"
+#define MEMACCESS2(offset, base) #offset "(%" #base ")"
+#define MEMLEA(offset, base) #offset "(%" #base ")"
+#define MEMLEA3(offset, index, scale) \
+    #offset "(,%" #index "," #scale ")"
+#define MEMLEA4(offset, base, index, scale) \
+    #offset "(%" #base ",%" #index "," #scale ")"
+#define MEMMOVESTRING(s, d)
+#define MEMSTORESTRING(reg, d)
+#define MEMOPREG(opcode, offset, base, index, scale, reg) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
+#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
+    #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
+#define MEMOPARG(opcode, offset, base, index, scale, arg) \
+    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#endif
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I411ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I422ToBGRARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_bgra,
+                        int width);
+void I422ToABGRRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_abgr,
+                        int width);
+void I422ToRGBARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        int width);
+void I422ToRGB24Row_NEON(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width);
+void I422ToRAWRow_NEON(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width);
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          int width);
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            int width);
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            int width);
+void NV12ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width);
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_vu,
+                        uint8* dst_argb,
+                        int width);
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width);
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_vu,
+                          uint8* dst_rgb565,
+                          int width);
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        int width);
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        int width);
+
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Unaligned_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Unaligned_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int pix);
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int pix);
+void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int pix);
+void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int pix);
+void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int pix);
+void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int pix);
+void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int pix);
+void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int pix);
+void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
+void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
+void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
+
+void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+                       uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
+                                  uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+                                 uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+                                 uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+                                 uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
+                            uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
+                           uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
+                           uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                             int pix);
+void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                             int pix);
+void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                             int pix);
+void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
+                           uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                           uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                            uint8* dst_u, uint8* dst_v, int pix);
+void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
+                              int src_stride_argb1555,
+                              uint8* dst_u, uint8* dst_v, int pix);
+void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
+                              int src_stride_argb4444,
+                              uint8* dst_u, uint8* dst_v, int pix);
+void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
+                   uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
+                   uint8* dst_u, uint8* dst_v, int width);
+void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
+                   uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
+                    uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
+                  uint8* dst_u, uint8* dst_v, int width);
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+                     uint8* dst_u, uint8* dst_v, int width);
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+                       uint8* dst_u, uint8* dst_v, int width);
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+                       uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_SSSE3(const uint8* src_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb,
+                                    uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
+                              uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV422Row_SSSE3(const uint8* src_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb,
+                                    uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_Any_SSSE3(const uint8* src_argb,
+                              uint8* dst_u, uint8* dst_v, int width);
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV422Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV411Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRow_C(const uint8* src, uint8* dst, int width);
+
+void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                       int width);
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width);
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                            int width);
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                   int width);
+
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                           int pix);
+void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                               int pix);
+void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
+                                     uint8* dst_v, int pix);
+void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                         int pix);
+void SplitUVRow_Any_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                               int pix);
+
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width);
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width);
+void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+                               uint8* dst_uv, int width);
+void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
+void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
+void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                         int width);
+
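
The SplitUVRow/MergeUVRow families above de-interleave and re-interleave a packed UV plane of the kind NV12/NV21 carry. Below is a minimal standalone sketch of that contract; it is an illustration only, not the library's code, and the names ending in _Sketch are invented here.

/* Editorial sketch (not libyuv source): what the SplitUVRow_C / MergeUVRow_C
 * contract computes for one row.  An interleaved row holds U0 V0 U1 V1 ...;
 * the pix/width argument counts UV pairs. */
#include <assert.h>
#include <string.h>

typedef unsigned char uint8;

static void SplitUVRow_Sketch(const uint8* src_uv, uint8* dst_u,
                              uint8* dst_v, int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[2 * i + 0];  /* even bytes go to the U plane */
    dst_v[i] = src_uv[2 * i + 1];  /* odd bytes go to the V plane  */
  }
}

static void MergeUVRow_Sketch(const uint8* src_u, const uint8* src_v,
                              uint8* dst_uv, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = src_u[i];
    dst_uv[2 * i + 1] = src_v[i];
  }
}

int main(void) {
  const uint8 uv[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  uint8 u[4], v[4], back[8];
  SplitUVRow_Sketch(uv, u, v, 4);
  MergeUVRow_Sketch(u, v, back, 4);
  assert(memcmp(uv, back, sizeof(uv)) == 0);  /* split/merge round-trips exactly */
  return 0;
}
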
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
+void CopyRow_X86(const uint8* src, uint8* dst, int count);
+void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
+void CopyRow_C(const uint8* src, uint8* dst, int count);
+
+void CopyRow_16_C(const uint16* src, uint16* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
+
+void SetRow_X86(uint8* dst, uint32 v32, int count);
+void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
+                     int dst_stride, int height);
+void SetRow_NEON(uint8* dst, uint32 v32, int count);
+void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
+                      int dst_stride, int height);
+void SetRow_C(uint8* dst, uint32 v32, int count);
+void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride,
+                   int height);
+
+// ARGBShufflers for BGRAToARGB etc.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+                      const uint8* shuffler, int pix);
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix);
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int pix);
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix);
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix);
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                    const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int pix);
+void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const uint8* shuffler, int pix);
+
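
The shuffler argument of the ARGBShuffleRow family above is a small table of byte indices applied within each 4-byte pixel. Here is a minimal sketch of that idea, using an assumed index table that simply reverses the byte order of each pixel; it is an illustration only, not the library's code.

/* Editorial sketch (not libyuv source): reorder the four bytes of each pixel
 * according to a byte-index table, as the ARGBShuffleRow declarations above
 * do for conversions such as BGRA to ARGB. */
#include <stdio.h>

typedef unsigned char uint8;

static void ShuffleRow_Sketch(const uint8* src, uint8* dst,
                              const uint8* shuffler, int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst[4 * i + 0] = src[4 * i + shuffler[0]];
    dst[4 * i + 1] = src[4 * i + shuffler[1]];
    dst[4 * i + 2] = src[4 * i + shuffler[2]];
    dst[4 * i + 3] = src[4 * i + shuffler[3]];
  }
}

int main(void) {
  const uint8 px[4] = {0x10, 0x20, 0x30, 0x40};
  const uint8 kReverseBytes[4] = {3, 2, 1, 0};  /* assumed table for illustration */
  uint8 out[4];
  ShuffleRow_Sketch(px, out, kReverseBytes, 1);
  printf("%02X %02X %02X %02X\n", out[0], out[1], out[2], out[3]);  /* 40 30 20 10 */
  return 0;
}
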
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix);
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix);
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix);
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix);
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix);
+void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                              int pix);
+void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                                int pix);
+void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                                int pix);
+void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
+                              int pix);
+void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                                int pix);
+void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                                int pix);
+
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
+void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix);
+
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     int width);
+void I422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     int width);
+void I411ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_argb,
+                     int width);
+void NV12ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_uv,
+                     uint8* dst_argb,
+                     int width);
+void NV21ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_vu,
+                       uint8* dst_argb,
+                       int width);
+void NV12ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_uv,
+                       uint8* dst_argb,
+                       int width);
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_vu,
+                     uint8* dst_argb,
+                     int width);
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+                     uint8* dst_argb,
+                     int width);
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+                     uint8* dst_argb,
+                     int width);
+void I422ToBGRARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_bgra,
+                     int width);
+void I422ToABGRRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_abgr,
+                     int width);
+void I422ToRGBARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_rgba,
+                     int width);
+void I422ToRGB24Row_C(const uint8* src_y,
+                      const uint8* src_u,
+                      const uint8* src_v,
+                      uint8* dst_rgb24,
+                      int width);
+void I422ToRAWRow_C(const uint8* src_y,
+                    const uint8* src_u,
+                    const uint8* src_v,
+                    uint8* dst_raw,
+                    int width);
+void I422ToARGB4444Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb4444,
+                         int width);
+void I422ToARGB1555Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb1555,
+                         int width);
+void I422ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_rgb565,
+                       int width);
+void YToARGBRow_C(const uint8* src_y,
+                  uint8* dst_argb,
+                  int width);
+void I422ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
+void I444ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         int width);
+void I422ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         int width);
+void I411ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb,
+                         int width);
+void NV12ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_uv,
+                         uint8* dst_argb,
+                         int width);
+void NV21ToARGBRow_SSSE3(const uint8* src_y,
+                         const uint8* src_vu,
+                         uint8* dst_argb,
+                         int width);
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_uv,
+                           uint8* dst_argb,
+                           int width);
+void NV21ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_vu,
+                           uint8* dst_argb,
+                           int width);
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+                         uint8* dst_argb,
+                         int width);
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+                         uint8* dst_argb,
+                         int width);
+void I422ToBGRARow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_bgra,
+                         int width);
+void I422ToABGRRow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_abgr,
+                         int width);
+void I422ToRGBARow_SSSE3(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgba,
+                         int width);
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           int width);
+// RGB24/RAW are unaligned.
+void I422ToRGB24Row_SSSE3(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb24,
+                          int width);
+void I422ToRAWRow_SSSE3(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_raw,
+                        int width);
+
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+                                   const uint8* src_u,
+                                   const uint8* src_v,
+                                   uint8* dst_argb,
+                                   int width);
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+                                   const uint8* src_u,
+                                   const uint8* src_v,
+                                   uint8* dst_argb,
+                                   int width);
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+                                   const uint8* src_u,
+                                   const uint8* src_v,
+                                   uint8* dst_argb,
+                                   int width);
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+                                   const uint8* src_uv,
+                                   uint8* dst_argb,
+                                   int width);
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* src_y,
+                                   const uint8* src_vu,
+                                   uint8* dst_argb,
+                                   int width);
+void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
+                                   uint8* dst_argb,
+                                   int width);
+void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
+                                   uint8* dst_argb,
+                                   int width);
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* src_y,
+                                   const uint8* src_u,
+                                   const uint8* src_v,
+                                   uint8* dst_bgra,
+                                   int width);
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* src_y,
+                                   const uint8* src_u,
+                                   const uint8* src_v,
+                                   uint8* dst_abgr,
+                                   int width);
+void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y,
+                                   const uint8* src_u,
+                                   const uint8* src_v,
+                                   uint8* dst_rgba,
+                                   int width);
+void I422ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_uv,
+                             uint8* dst_argb,
+                             int width);
+void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_vu,
+                             uint8* dst_argb,
+                             int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
+                               const uint8* src_uv,
+                               uint8* dst_argb,
+                               int width);
+void NV21ToRGB565Row_Any_SSSE3(const uint8* src_y,
+                               const uint8* src_vu,
+                               uint8* dst_argb,
+                               int width);
+void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
+                             uint8* dst_argb,
+                             int width);
+void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
+                             uint8* dst_argb,
+                             int width);
+void I422ToBGRARow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_bgra,
+                             int width);
+void I422ToABGRRow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_abgr,
+                             int width);
+void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_rgba,
+                             int width);
+void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
+                                 const uint8* src_u,
+                                 const uint8* src_v,
+                                 uint8* dst_rgba,
+                                 int width);
+void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
+                                 const uint8* src_u,
+                                 const uint8* src_v,
+                                 uint8* dst_rgba,
+                                 int width);
+void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
+                               const uint8* src_u,
+                               const uint8* src_v,
+                               uint8* dst_rgba,
+                               int width);
+// RGB24/RAW are unaligned.
+void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void YToARGBRow_SSE2(const uint8* src_y,
+                     uint8* dst_argb,
+                     int width);
+void YToARGBRow_NEON(const uint8* src_y,
+                     uint8* dst_argb,
+                     int width);
+void YToARGBRow_Any_SSE2(const uint8* src_y,
+                         uint8* dst_argb,
+                         int width);
+void YToARGBRow_Any_NEON(const uint8* src_y,
+                         uint8* dst_argb,
+                         int width);
+
+// ARGB preattenuated alpha blend.
+void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
+                        uint8* dst_argb, int width);
+void ARGBBlendRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
+                    uint8* dst_argb, int width);
+
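
Per the comment above, ARGBBlendRow expects the foreground's colour channels to already be scaled by its alpha ("preattenuated"). The sketch below shows that blend for one pixel; the rounding is chosen here for illustration and is not taken from the library.

/* Editorial sketch (not libyuv source): blending with premultiplied alpha.
 * With preattenuated channels each output channel is
 *   fg + bg * (255 - fg_alpha) / 255. */
#include <stdio.h>

typedef unsigned char uint8;

static uint8 BlendChannel(int fg, int bg, int fg_alpha) {
  return (uint8)(fg + (bg * (255 - fg_alpha) + 127) / 255);
}

static void BlendPixel(const uint8 fg[4], const uint8 bg[4], uint8 out[4]) {
  int a = fg[3];                 /* byte 3 carries alpha in this layout */
  out[0] = BlendChannel(fg[0], bg[0], a);
  out[1] = BlendChannel(fg[1], bg[1], a);
  out[2] = BlendChannel(fg[2], bg[2], a);
  out[3] = 255;                  /* result treated as opaque for illustration */
}

int main(void) {
  const uint8 fg[4] = {64, 64, 64, 128};  /* half-transparent grey, premultiplied */
  const uint8 bg[4] = {200, 0, 0, 255};
  uint8 out[4];
  BlendPixel(fg, bg, out);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}
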
+// ARGB multiply images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+
+// ARGB add images.
+void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
+                  uint8* dst_argb, int width);
+void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                     uint8* dst_argb, int width);
+void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                         uint8* dst_argb, int width);
+
+// ARGB subtract images. Same API as Blend, but these require
+// pointer and width alignment for SSE2.
+void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width);
+void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
+                          uint8* dst_argb, int width);
+void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
+                              uint8* dst_argb, int width);
+
+void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix);
+
+void I444ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I411ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToBGRARow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToABGRRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToRGBARow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
+void I422ToRGB24Row_Any_NEON(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* dst_argb,
+                             int width);
+void I422ToRAWRow_Any_NEON(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_argb,
+                           int width);
+void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb,
+                                int width);
+void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
+                                const uint8* src_u,
+                                const uint8* src_v,
+                                uint8* dst_argb,
+                                int width);
+void I422ToRGB565Row_Any_NEON(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+void NV12ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_uv,
+                            uint8* dst_argb,
+                            int width);
+void NV21ToARGBRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_uv,
+                            uint8* dst_argb,
+                            int width);
+void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
+                              const uint8* src_uv,
+                              uint8* dst_argb,
+                              int width);
+void NV21ToRGB565Row_Any_NEON(const uint8* src_y,
+                              const uint8* src_uv,
+                              uint8* dst_argb,
+                              int width);
+void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
+                            uint8* dst_argb,
+                            int width);
+void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
+                            uint8* dst_argb,
+                            int width);
+void I422ToARGBRow_MIPS_DSPR2(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+void I422ToBGRARow_MIPS_DSPR2(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+void I422ToABGRRow_MIPS_DSPR2(const uint8* src_y,
+                              const uint8* src_u,
+                              const uint8* src_v,
+                              uint8* dst_argb,
+                              int width);
+
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+                               uint8* dst_y, int pix);
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                                uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+                                   uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
+                   uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+                               uint8* dst_y, int pix);
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                                uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+                                   uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
+                   uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+
+void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+               uint8* dst_uv, int pix);
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix);
+void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix);
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix);
+
+void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
+                  uint16* dst_uv, int pix);
+
+void ARGBToBayerRow_C(const uint8* src_argb, uint8* dst_bayer,
+                      uint32 selector, int pix);
+void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+                          uint32 selector, int pix);
+void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+                         uint32 selector, int pix);
+void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+                              uint32 selector, int pix);
+void ARGBToBayerRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
+                             uint32 selector, int pix);
+void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
+                        uint32 /* selector */, int pix);
+void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
+                           uint32 /* selector */, int pix);
+void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+                           uint32 /* selector */, int pix);
+void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer,
+                               uint32 /* selector */, int pix);
+void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
+                               uint32 /* selector */, int pix);
+
+void I422ToYUY2Row_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_yuy2, int width);
+void I422ToUYVYRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_uyvy, int width);
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width);
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_uyvy, int width);
+
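
The I422ToYUY2Row/I422ToUYVYRow functions above pack planar 4:2:2 data into interleaved form. A minimal sketch of the YUY2 packing follows (Y0 U0 Y1 V0 per pixel pair; UYVY simply reorders this to U0 Y0 V0 Y1), assuming an even width; it is an illustration only.

/* Editorial sketch (not libyuv source): pack one planar 4:2:2 row into YUY2.
 * Two luma samples share one U and one V sample. */
#include <stdio.h>

typedef unsigned char uint8;

static void I422ToYUY2Row_Sketch(const uint8* y, const uint8* u,
                                 const uint8* v, uint8* dst, int width) {
  int i;
  for (i = 0; i < width; i += 2) {  /* width assumed even here */
    *dst++ = y[i];
    *dst++ = u[i / 2];
    *dst++ = y[i + 1];
    *dst++ = v[i / 2];
  }
}

int main(void) {
  const uint8 y[4] = {10, 11, 12, 13};
  const uint8 u[2] = {100, 101};
  const uint8 v[2] = {200, 201};
  uint8 yuy2[8];
  int i;
  I422ToYUY2Row_Sketch(y, u, v, yuy2, 4);
  for (i = 0; i < 8; ++i)
    printf("%d ", yuy2[i]);  /* expected: 10 100 11 200 12 101 13 201 */
  printf("\n");
  return 0;
}
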
+// Effects related row functions.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                int width);
+void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+
+// Inverse table for unattenuate, shared by C and SSE2.
+extern const uint32 fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                                 int width);
+void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+                                 int width);
+
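
Attenuate multiplies each colour channel by the pixel's alpha and unattenuate divides it back out; as the comment above notes, the C and SSE2 paths share fixed_invtbl8 so the division becomes a table lookup. The sketch below shows only the arithmetic, with rounding and clamping chosen here for illustration.

/* Editorial sketch (not libyuv source): premultiply by alpha and undo it.
 * Pixel bytes are ordered B, G, R, A for this illustration. */
typedef unsigned char uint8;

static void AttenuatePixel(uint8 px[4]) {
  int a = px[3];
  px[0] = (uint8)((px[0] * a) / 255);
  px[1] = (uint8)((px[1] * a) / 255);
  px[2] = (uint8)((px[2] * a) / 255);
}

static void UnattenuatePixel(uint8 px[4]) {
  int a = px[3];
  int b, g, r;
  if (a == 0) return;                 /* fully transparent: nothing to recover */
  b = (px[0] * 255) / a;
  g = (px[1] * 255) / a;
  r = (px[2] * 255) / a;
  px[0] = (uint8)(b > 255 ? 255 : b); /* clamp; inputs may not be attenuated */
  px[1] = (uint8)(g > 255 ? 255 : g);
  px[2] = (uint8)(r > 255 ? 255 : r);
}

int main(void) {
  uint8 px[4] = {200, 100, 50, 128};
  AttenuatePixel(px);    /* roughly {100, 50, 25, 128} */
  UnattenuatePixel(px);  /* roughly the original, minus rounding loss */
  return 0;
}
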
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+
+void ARGBSepiaRow_C(uint8* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
+
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+                          const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width);
+
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+                       int interval_offset, int width);
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width);
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width);
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                    uint32 value);
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value);
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value);
+
+// Used for blur.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  const int32* previous_cumsum, int width);
+
+void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
+                                 int width, int area, uint8* dst, int count);
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+                               const int32* previous_cumsum, int width);
+
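
The blur helpers above build a cumulative-sum (summed-area) table row by row, so any box average needs only four table entries regardless of box size. Below is a single-channel sketch of that idea; the real row functions keep four 32-bit sums per ARGB pixel. Illustration only.

/* Editorial sketch (not libyuv source): summed-area table and box averaging. */
#include <stdio.h>

#define W 4
#define H 3

int main(void) {
  const int img[H][W] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
  /* cum[y][x] = sum of img over the rectangle [0, y) x [0, x). */
  int cum[H + 1][W + 1] = {{0}};
  int x, y;
  for (y = 0; y < H; ++y)
    for (x = 0; x < W; ++x)
      cum[y + 1][x + 1] = img[y][x] + cum[y][x + 1] + cum[y + 1][x] - cum[y][x];

  /* Average of the 2x2 box with top-left corner (1, 1): four lookups. */
  {
    int x0 = 1, y0 = 1, x1 = 3, y1 = 3;
    int sum = cum[y1][x1] - cum[y0][x1] - cum[y1][x0] + cum[y0][x0];
    int area = (x1 - x0) * (y1 - y0);
    printf("box average = %d\n", sum / area);  /* (6 + 7 + 10 + 11) / 4 = 8 */
  }
  return 0;
}
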
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+                     uint8* dst_argb, const float* uv_dudv, int width);
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width);
+
+// Used for I420Scale, ARGBScale, and ARGBInterpolate.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+                      ptrdiff_t src_stride_ptr,
+                      int width, int source_y_fraction);
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride_ptr, int width,
+                         int source_y_fraction);
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride_ptr, int width,
+                          int source_y_fraction);
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride_ptr, int width,
+                         int source_y_fraction);
+void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride_ptr, int width,
+                         int source_y_fraction);
+void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                                ptrdiff_t src_stride_ptr, int width,
+                                int source_y_fraction);
+void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                                   ptrdiff_t src_stride_ptr, int width,
+                                   int source_y_fraction);
+void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                                    ptrdiff_t src_stride_ptr, int width,
+                                    int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                             ptrdiff_t src_stride_ptr, int width,
+                             int source_y_fraction);
+void InterpolateRow_Any_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                             ptrdiff_t src_stride_ptr, int width,
+                             int source_y_fraction);
+void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                              ptrdiff_t src_stride_ptr, int width,
+                              int source_y_fraction);
+void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                             ptrdiff_t src_stride_ptr, int width,
+                             int source_y_fraction);
+void InterpolateRows_Any_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                                    ptrdiff_t src_stride_ptr, int width,
+                                    int source_y_fraction);
+
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         ptrdiff_t src_stride_ptr,
+                         int width, int source_y_fraction);
+
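
InterpolateRow blends two source rows by source_y_fraction when scaling vertically. The sketch below assumes a 0..256 fraction scale with 0 meaning "all of the first row"; the library's exact scale and rounding are not reproduced here.

/* Editorial sketch (not libyuv source): weighted blend of two rows. */
#include <stdio.h>

typedef unsigned char uint8;

static void InterpolateRow_Sketch(uint8* dst, const uint8* row0,
                                  const uint8* row1, int width, int frac) {
  int w1 = frac;        /* weight of the second row */
  int w0 = 256 - frac;  /* weight of the first row  */
  int i;
  for (i = 0; i < width; ++i)
    dst[i] = (uint8)((row0[i] * w0 + row1[i] * w1 + 128) >> 8);
}

int main(void) {
  const uint8 a[4] = {0, 100, 200, 255};
  const uint8 b[4] = {255, 100, 0, 0};
  uint8 out[4];
  InterpolateRow_Sketch(out, a, b, 4, 64);  /* one quarter of the way from a to b */
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}
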
+// Sobel images.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+                 uint8* dst_sobelx, int width);
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width);
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+                 uint8* dst_sobely, int width);
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width);
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width);
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                uint8* dst_argb, int width);
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width);
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width);
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_y, int width);
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width);
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width);
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width);
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width);
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width);
+
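
The Sobel row functions above derive horizontal and vertical gradients from neighbouring luma rows and combine their magnitudes. Below is a minimal sketch of the standard 3x3 Sobel operator and the clamped |gx| + |gy| magnitude, which is the kind of result those functions cooperate to produce; illustration only.

/* Editorial sketch (not libyuv source): standard 3x3 Sobel magnitude. */
#include <stdio.h>
#include <stdlib.h>

static int SobelMagnitude(const unsigned char p[3][3]) {
  /* gx kernel: [-1 0 1; -2 0 2; -1 0 1], gy kernel: [-1 -2 -1; 0 0 0; 1 2 1] */
  int gx = -p[0][0] + p[0][2] - 2 * p[1][0] + 2 * p[1][2] - p[2][0] + p[2][2];
  int gy = -p[0][0] - 2 * p[0][1] - p[0][2] + p[2][0] + 2 * p[2][1] + p[2][2];
  int m = abs(gx) + abs(gy);
  return m > 255 ? 255 : m;  /* clamp to a byte */
}

int main(void) {
  /* A vertical edge: dark on the left, bright on the right. */
  const unsigned char patch[3][3] = {{0, 0, 255}, {0, 0, 255}, {0, 0, 255}};
  printf("sobel = %d\n", SobelMagnitude(patch));  /* strong response: 255 */
  return 0;
}
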
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb, const float* poly,
+                         int width);
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width);
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width);
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                             const uint8* luma, uint32 lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT
--- a/third_party/libyuv/include/libyuv/scale.h
+++ b/third_party/libyuv/include/libyuv/scale.h
@@ -1,17 +1,17 @@
 /*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
+ *  in the file PATENTS. All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_H_
+#ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
 #define INCLUDE_LIBYUV_SCALE_H_
 
-#include "third_party/libyuv/include/libyuv/basic_types.h"
+#include "basic_types.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -18,13 +18,28 @@
 extern "C" {
 #endif
 
-// Supported filtering
-typedef enum {
-  kFilterNone = 0,  // Point sample; Fastest
-  kFilterBilinear = 1,  // Faster than box, but lower quality scaling down.
-  kFilterBox = 2  // Highest quality
+// Supported filtering.
+typedef enum FilterMode {
+  kFilterNone = 0,  // Point sample; Fastest.
+  kFilterLinear = 1,  // Filter horizontally only.
+  kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
+  kFilterBox = 3  // Highest quality.
 } FilterModeEnum;
 
+// Scale a YUV plane.
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+                int src_width, int src_height,
+                uint8* dst, int dst_stride,
+                int dst_width, int dst_height,
+                enum FilterMode filtering);
+
+void ScalePlane_16(const uint16* src, int src_stride,
+                   int src_width, int src_height,
+                   uint16* dst, int dst_stride,
+                   int dst_width, int dst_height,
+                   enum FilterMode filtering);
+
 // Scales a YUV 4:2:0 image from the src width and height to the
 // dst width and height.
 // If filtering is kFilterNone, a simple nearest-neighbor algorithm is
@@ -35,6 +50,7 @@
 // quality image, at further expense of speed.
 // Returns 0 if successful.
 
+LIBYUV_API
 int I420Scale(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
@@ -43,9 +59,22 @@
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int dst_width, int dst_height,
-              FilterModeEnum filtering);
+              enum FilterMode filtering);
 
-// Legacy API.  Deprecated
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+                 const uint16* src_u, int src_stride_u,
+                 const uint16* src_v, int src_stride_v,
+                 int src_width, int src_height,
+                 uint16* dst_y, int dst_stride_y,
+                 uint16* dst_u, int dst_stride_u,
+                 uint16* dst_v, int dst_stride_v,
+                 int dst_width, int dst_height,
+                 enum FilterMode filtering);
+
+#ifdef __cplusplus
+// Legacy API.  Deprecated.
+LIBYUV_API
 int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
           int src_stride_y, int src_stride_u, int src_stride_v,
           int src_width, int src_height,
@@ -52,15 +81,18 @@
           uint8* dst_y, uint8* dst_u, uint8* dst_v,
           int dst_stride_y, int dst_stride_u, int dst_stride_v,
           int dst_width, int dst_height,
-          int interpolate);
+          LIBYUV_BOOL interpolate);
 
-// Legacy API.  Deprecated
-int ScaleOffset(const uint8* src, int src_width, int src_height,
-                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
-                int interpolate);
+// Legacy API.  Deprecated.
+LIBYUV_API
+int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
+                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
+                LIBYUV_BOOL interpolate);
 
-// For testing, allow disabling of optimizations.
-void SetUseReferenceImpl(int use);
+// For testing, allow disabling of specialized scalers.
+LIBYUV_API
+void SetUseReferenceImpl(LIBYUV_BOOL use);
+#endif  // __cplusplus
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -67,4 +99,4 @@
 }  // namespace libyuv
 #endif
 
-#endif // INCLUDE_LIBYUV_SCALE_H_
+#endif  // INCLUDE_LIBYUV_SCALE_H_  NOLINT
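The reworked scale.h now exposes ScalePlane, 16-bit variants, and the renamed FilterMode enum (with the new kFilterLinear value). A minimal usage sketch for the 8-bit I420Scale path follows; the 640x360 dimensions, the malloc-based buffers, the unpadded strides, and the kFilterBox choice are illustrative assumptions, not anything this patch prescribes.

#include <stdint.h>
#include <stdlib.h>
#include "libyuv/scale.h"  // include path is an assumption; adjust to the build

// Hedged sketch: downscale a 640x360 I420 frame to 320x180 with box filtering.
int ScaleExample(const uint8_t* src_y, const uint8_t* src_u,
                 const uint8_t* src_v) {
  const int sw = 640, sh = 360, dw = 320, dh = 180;
  uint8_t* dst_y = (uint8_t*)malloc(dw * dh);
  uint8_t* dst_u = (uint8_t*)malloc((dw / 2) * (dh / 2));
  uint8_t* dst_v = (uint8_t*)malloc((dw / 2) * (dh / 2));
  // Strides equal the plane widths here; real buffers may pad rows.
  int ret = libyuv::I420Scale(src_y, sw, src_u, sw / 2, src_v, sw / 2, sw, sh,
                              dst_y, dw, dst_u, dw / 2, dst_v, dw / 2, dw, dh,
                              libyuv::kFilterBox);
  free(dst_y);
  free(dst_u);
  free(dst_v);
  return ret;  // 0 on success
}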
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/scale_row.h
@@ -1,0 +1,341 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_  // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ROW_H_
+
+#include "basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
+    defined(TARGET_IPHONE_SIMULATOR)
+#define LIBYUV_DISABLE_X86
+#endif
+
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_SCALEROWDOWN2_SSE2
+#define HAS_SCALEROWDOWN4_SSE2
+#define HAS_SCALEROWDOWN34_SSSE3
+#define HAS_SCALEROWDOWN38_SSSE3
+#define HAS_SCALEADDROWS_SSE2
+#define HAS_SCALEFILTERCOLS_SSSE3
+#define HAS_SCALECOLSUP2_SSE2
+#define HAS_SCALEARGBROWDOWN2_SSE2
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+#define HAS_SCALEARGBCOLS_SSE2
+#define HAS_SCALEARGBFILTERCOLS_SSSE3
+#define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_FIXEDDIV_X86
+#define HAS_FIXEDDIV1_X86
+#endif
+
+// The following are available on Neon platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEROWDOWN2_NEON
+#define HAS_SCALEROWDOWN4_NEON
+#define HAS_SCALEROWDOWN34_NEON
+#define HAS_SCALEROWDOWN38_NEON
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+#define HAS_SCALEARGBROWDOWN2_NEON
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+    defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_SCALEROWDOWN2_MIPS_DSPR2
+#define HAS_SCALEROWDOWN4_MIPS_DSPR2
+#define HAS_SCALEROWDOWN34_MIPS_DSPR2
+#define HAS_SCALEROWDOWN38_MIPS_DSPR2
+#endif
+
+// Scale ARGB vertically with bilinear interpolation.
+void ScalePlaneVertical(int src_height,
+                        int dst_width, int dst_height,
+                        int src_stride, int dst_stride,
+                        const uint8* src_argb, uint8* dst_argb,
+                        int x, int y, int dy,
+                        int bpp, enum FilterMode filtering);
+
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint16* src_argb, uint16* dst_argb,
+                           int x, int y, int dy,
+                           int wpp, enum FilterMode filtering);
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering);
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div);
+int FixedDiv_X86(int num, int div);
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div);
+int FixedDiv1_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
+#define FixedDiv1 FixedDiv1_X86
+#else
+#define FixedDiv FixedDiv_C
+#define FixedDiv1 FixedDiv1_C
+#endif
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+                int dst_width, int dst_height,
+                enum FilterMode filtering,
+                int* x, int* y, int* dx, int* dy);
+
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width);
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width);
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                              uint16* dst, int dst_width);
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width);
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width);
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width);
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width);
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width);
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width);
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                 int dst_width, int x, int dx);
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                    int dst_width, int x, int dx);
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+                    int dst_width, int, int);
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int, int);
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx);
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                          int dst_width, int x, int dx);
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+                         int dst_width, int x, int dx);
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                            int dst_width, int x, int dx);
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width);
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width);
+void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                    uint16* dst_ptr, int src_width, int src_height);
+void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                       uint32* dst_ptr, int src_width, int src_height);
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+                     int dst_width, int x, int dx);
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+                       int dst_width, int x, int dx);
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int, int);
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx);
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+                             int dst_width, int x, int dx);
+
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
+                                        ptrdiff_t src_stride,
+                                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width);
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                       uint16* dst_ptr, int src_width,
+                       int src_height);
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx);
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx);
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx);
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx);
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx);
+// Row functions.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+
+// ScaleRowDown2Box is also used by the planar functions.
+// NEON downscalers with interpolation.
+
+// Note - not static due to reuse in convert for 444 to 420.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width);
+
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width);
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width);
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load up every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width);
+// 32x3 -> 12x1
+void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width);
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                 uint8* dst, int dst_width);
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width);
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                 uint8* dst, int dst_width);
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* d, int dst_width);
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* d, int dst_width);
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width);
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width);
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_  NOLINT
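The FixedDiv helpers declared above express scale steps as 16.16 fixed point, which is what the x/dx and y/dy parameters threaded through the column scalers carry. A worked sketch of the idea, assuming the straightforward (num << 16) / div reference formulation; the shipped _C and _X86 implementations may round differently.

#include <stdint.h>

// 16.16 fixed point: high 16 bits integer part, low 16 bits fraction.
// Hedged reference sketch, not the shipped FixedDiv_C/FixedDiv_X86.
static int FixedDivSketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}

// Example: mapping 320 destination columns onto 640 source columns gives
// dx = FixedDivSketch(640, 320) = 2 << 16, so x advances two source pixels
// per destination pixel:
//   int x = 0, dx = FixedDivSketch(640, 320);
//   for (int j = 0; j < 320; ++j) {
//     int src_col = x >> 16;  // integer source column
//     x += dx;                // step in 16.16 fixed point
//   }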
--- a/third_party/libyuv/source/cpu_id.c
+++ /dev/null
@@ -1,81 +1,0 @@
-/*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-
-#ifdef _MSC_VER
-#include <intrin.h>
-#endif
-#ifdef __ANDROID__
-#include <cpu-features.h>
-#endif
-
-#include "third_party/libyuv/include/libyuv/basic_types.h"  // for CPU_X86
-
-// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
-#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
-static inline void __cpuid(int cpu_info[4], int info_type) {
-  asm volatile (
-    "mov %%ebx, %%edi                          \n"
-    "cpuid                                     \n"
-    "xchg %%edi, %%ebx                         \n"
-    : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type)
-  );
-}
-#elif defined(__i386__) || defined(__x86_64__)
-static inline void __cpuid(int cpu_info[4], int info_type) {
-  asm volatile (
-    "cpuid                                     \n"
-    : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
-    : "a"(info_type)
-  );
-}
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// CPU detect function for SIMD instruction sets.
-int cpu_info_ = 0;
-
-int InitCpuFlags() {
-#ifdef CPU_X86
-  int cpu_info[4];
-  __cpuid(cpu_info, 1);
-  cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
-              (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
-              kCpuInitialized;
-#elif defined(__ANDROID__) && defined(__ARM_NEON__)
-  uint64_t features = android_getCpuFeatures();
-  cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
-              kCpuInitialized;
-#elif defined(__ARM_NEON__)
-  // gcc -mfpu=neon defines __ARM_NEON__
-  // Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
-  // to disable Neon on devices that do not have it.
-  cpu_info_ = kCpuHasNEON | kCpuInitialized;
-#else
-  cpu_info_ = kCpuInitialized;
-#endif
-  return cpu_info_;
-}
-
-void MaskCpuFlags(int enable_flags) {
-  InitCpuFlags();
-  cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
--- /dev/null
+++ b/third_party/libyuv/source/cpu_id.cc
@@ -1,0 +1,283 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>  // For __cpuidex()
+#endif
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    !defined(__native_client__) && defined(_M_X64) && \
+    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#include <immintrin.h>  // For _xgetbv()
+#endif
+
+#if !defined(__native_client__)
+#include <stdlib.h>  // For getenv()
+#endif
+
+// For ArmCpuCaps() but unit tested on all platforms
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/libyuv/include/libyuv/basic_types.h"  // For CPU_X86
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// For functions that use the stack and have runtime checks for overflow,
+// use SAFEBUFFERS to avoid the additional check.
+#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+#define SAFEBUFFERS __declspec(safebuffers)
+#else
+#define SAFEBUFFERS
+#endif
+
+// Low level cpuid for X86. Returns zeros on other CPUs.
+#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+    (defined(_M_IX86) || defined(_M_X64) || \
+    defined(__i386__) || defined(__x86_64__))
+LIBYUV_API
+void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+#if defined(_MSC_VER)
+#if (_MSC_FULL_VER >= 160040219)
+  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+#elif defined(_M_IX86)
+  __asm {
+    mov        eax, info_eax
+    mov        ecx, info_ecx
+    mov        edi, cpu_info
+    cpuid
+    mov        [edi], eax
+    mov        [edi + 4], ebx
+    mov        [edi + 8], ecx
+    mov        [edi + 12], edx
+  }
+#else
+  if (info_ecx == 0) {
+    __cpuid((int*)(cpu_info), info_eax);
+  } else {
+    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+  }
+#endif
+#else  // defined(_MSC_VER)
+  uint32 info_ebx, info_edx;
+  asm volatile (  // NOLINT
+#if defined( __i386__) && defined(__PIC__)
+    // Preserve ebx for fpic 32 bit.
+    "mov %%ebx, %%edi                          \n"
+    "cpuid                                     \n"
+    "xchg %%edi, %%ebx                         \n"
+    : "=D" (info_ebx),
+#else
+    "cpuid                                     \n"
+    : "=b" (info_ebx),
+#endif  //  defined( __i386__) && defined(__PIC__)
+      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+  cpu_info[0] = info_eax;
+  cpu_info[1] = info_ebx;
+  cpu_info[2] = info_ecx;
+  cpu_info[3] = info_edx;
+#endif  // defined(_MSC_VER)
+}
+
+#if !defined(__native_client__)
+#define HAS_XGETBV
+// X86 CPUs have xgetbv to detect whether the OS saves upper ymm registers.
+int TestOsSaveYmm() {
+  uint32 xcr0 = 0u;
+#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
+#elif defined(_M_IX86)
+  __asm {
+    xor        ecx, ecx    // xcr 0
+    _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0  // For VS2010 and earlier.
+    mov        xcr0, eax
+  }
+#elif defined(__i386__) || defined(__x86_64__)
+  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
+#endif  // defined(_MSC_VER)
+  return((xcr0 & 6) == 6);  // Is ymm saved?
+}
+#endif  // !defined(__native_client__)
+#else
+LIBYUV_API
+void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+  cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
+}
+#endif
+
+// based on libvpx arm_cpudetect.c
+// For Arm, but public to allow testing on any CPU
+LIBYUV_API SAFEBUFFERS
+int ArmCpuCaps(const char* cpuinfo_name) {
+  char cpuinfo_line[512];
+  FILE* f = fopen(cpuinfo_name, "r");
+  if (!f) {
+    // Assume Neon if /proc/cpuinfo is unavailable.
+    // This will occur for Chrome sandbox for Pepper or Render process.
+    return kCpuHasNEON;
+  }
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+    if (memcmp(cpuinfo_line, "Features", 8) == 0) {
+      char* p = strstr(cpuinfo_line, " neon");
+      if (p && (p[5] == ' ' || p[5] == '\n')) {
+        fclose(f);
+        return kCpuHasNEON;
+      }
+    }
+  }
+  fclose(f);
+  return 0;
+}
+
+#if defined(__mips__) && defined(__linux__)
+static int MipsCpuCaps(const char* search_string) {
+  char cpuinfo_line[512];
+  const char* file_name = "/proc/cpuinfo";
+  FILE* f = fopen(file_name, "r");
+  if (!f) {
+    // Assume DSP if /proc/cpuinfo is unavailable.
+    // This will occur for Chrome sandbox for Pepper or Render process.
+    return kCpuHasMIPS_DSP;
+  }
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f) != NULL) {
+    if (strstr(cpuinfo_line, search_string) != NULL) {
+      fclose(f);
+      return kCpuHasMIPS_DSP;
+    }
+  }
+  fclose(f);
+  return 0;
+}
+#endif
+
+// CPU detect function for SIMD instruction sets.
+LIBYUV_API
+int cpu_info_ = kCpuInit;  // cpu_info is not initialized yet.
+
+// Test environment variable for disabling CPU features. Any non-zero value
+// disables the feature; zero is ignored, to make it easy to toggle the
+// variable on and off.
+#if !defined(__native_client__) && !defined(_M_ARM)
+
+static LIBYUV_BOOL TestEnv(const char* name) {
+  const char* var = getenv(name);
+  if (var) {
+    if (var[0] != '0') {
+      return LIBYUV_TRUE;
+    }
+  }
+  return LIBYUV_FALSE;
+}
+#else  // nacl does not support getenv().
+static LIBYUV_BOOL TestEnv(const char*) {
+  return LIBYUV_FALSE;
+}
+#endif
+
+LIBYUV_API SAFEBUFFERS
+int InitCpuFlags(void) {
+#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+
+  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
+  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+  CpuId(1, 0, cpu_info1);
+  CpuId(7, 0, cpu_info7);
+  cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+              ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
+              ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
+              ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
+              ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
+              ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+              kCpuHasX86;
+#ifdef HAS_XGETBV
+  if ((cpu_info1[2] & 0x18000000) == 0x18000000 &&  // AVX and OSSave
+      TestOsSaveYmm()) {  // Saves YMM.
+    cpu_info_ |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+                 kCpuHasAVX;
+  }
+#endif
+  // Environment variable overrides for testing.
+  if (TestEnv("LIBYUV_DISABLE_X86")) {
+    cpu_info_ &= ~kCpuHasX86;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
+    cpu_info_ &= ~kCpuHasSSE2;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
+    cpu_info_ &= ~kCpuHasSSSE3;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
+    cpu_info_ &= ~kCpuHasSSE41;
+  }
+  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
+    cpu_info_ &= ~kCpuHasSSE42;
+  }
+  if (TestEnv("LIBYUV_DISABLE_AVX")) {
+    cpu_info_ &= ~kCpuHasAVX;
+  }
+  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
+    cpu_info_ &= ~kCpuHasAVX2;
+  }
+  if (TestEnv("LIBYUV_DISABLE_ERMS")) {
+    cpu_info_ &= ~kCpuHasERMS;
+  }
+  if (TestEnv("LIBYUV_DISABLE_FMA3")) {
+    cpu_info_ &= ~kCpuHasFMA3;
+  }
+#elif defined(__mips__) && defined(__linux__)
+  // Linux mips parse text file for dsp detect.
+  cpu_info_ = MipsCpuCaps("dsp");  // set kCpuHasMIPS_DSP.
+#if defined(__mips_dspr2)
+  cpu_info_ |= kCpuHasMIPS_DSPR2;
+#endif
+  cpu_info_ |= kCpuHasMIPS;
+
+  if (getenv("LIBYUV_DISABLE_MIPS")) {
+    cpu_info_ &= ~kCpuHasMIPS;
+  }
+  if (getenv("LIBYUV_DISABLE_MIPS_DSP")) {
+    cpu_info_ &= ~kCpuHasMIPS_DSP;
+  }
+  if (getenv("LIBYUV_DISABLE_MIPS_DSPR2")) {
+    cpu_info_ &= ~kCpuHasMIPS_DSPR2;
+  }
+#elif defined(__arm__)
+// gcc -mfpu=neon defines __ARM_NEON__
+// __ARM_NEON__ generates code that requires Neon.  NaCL also requires Neon.
+// For Linux, /proc/cpuinfo can be tested; without it, assume Neon.
+#if defined(__ARM_NEON__) || defined(__native_client__) || !defined(__linux__)
+  cpu_info_ = kCpuHasNEON;
+#else
+  // Linux arm parse text file for neon detect.
+  cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
+#endif
+  cpu_info_ |= kCpuHasARM;
+  if (TestEnv("LIBYUV_DISABLE_NEON")) {
+    cpu_info_ &= ~kCpuHasNEON;
+  }
+#endif  // __arm__
+  if (TestEnv("LIBYUV_DISABLE_ASM")) {
+    cpu_info_ = 0;
+  }
+  return cpu_info_;
+}
+
+LIBYUV_API
+void MaskCpuFlags(int enable_flags) {
+  cpu_info_ = InitCpuFlags() & enable_flags;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
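The rewritten cpu_id.cc adds finer-grained x86 detection (SSE4.1/4.2, AVX, AVX2, FMA3, ERMS), MIPS DSP probing, and the LIBYUV_DISABLE_* environment overrides. A hedged sketch of how a test might force the C fallbacks; the flag names come from the code above, while the include path and the POSIX setenv call are assumptions for illustration.

#include <stdio.h>
#include <stdlib.h>
#include "libyuv/cpu_id.h"  // include path is an assumption for this sketch

// Option 1: mask every SIMD flag at runtime so only the C paths remain.
void ForceCPathsForTests(void) {
  libyuv::MaskCpuFlags(0);
}

// Option 2: disable a single feature via the environment before detection
// runs. Any non-zero value disables it; "0" is ignored.
void DisableAvx2ViaEnv(void) {
  setenv("LIBYUV_DISABLE_AVX2", "1", 1);  // POSIX-only in this sketch
  printf("cpu flags: 0x%x\n", libyuv::InitCpuFlags());
}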
--- /dev/null
+++ b/third_party/libyuv/source/planar_functions.cc
@@ -1,0 +1,2287 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/planar_functions.h"
+
+#include <string.h>  // for memset()
+
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#ifdef HAVE_JPEG
+#include "third_party/libyuv/include/libyuv/mjpeg_decoder.h"
+#endif
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy a plane of data
+LIBYUV_API
+void CopyPlane(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  int y;
+  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_COPYROW_X86)
+  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+    CopyRow = CopyRow_X86;
+  }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    CopyRow = CopyRow_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_MIPS;
+  }
+#endif
+
+  // Copy plane
+  for (y = 0; y < height; ++y) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
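// Editorial note, not part of the patch: the "Coalesce rows" check above is
// a recurring trick in this file. When both strides equal the width, the
// plane is one contiguous block, so CopyPlane folds it into a single very
// wide row and the selected CopyRow variant runs exactly once, e.g. for a
// 640x360 plane with stride 640 on both sides:
//   width *= height;   // width becomes 640 * 360 = 230400
//   height = 1;        // one "row" now covers the entire plane
// This removes per-row loop overhead and lets CopyRow_SSE2/NEON/ERMS stream
// the full 230400 bytes in one call.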
+LIBYUV_API
+void CopyPlane_16(const uint16* src_y, int src_stride_y,
+                  uint16* dst_y, int dst_stride_y,
+                  int width, int height) {
+  int y;
+  void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_COPYROW_16_X86)
+  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+    CopyRow = CopyRow_16_X86;
+  }
+#endif
+#if defined(HAS_COPYROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    CopyRow = CopyRow_16_SSE2;
+  }
+#endif
+#if defined(HAS_COPYROW_16_ERMS)
+  if (TestCpuFlag(kCpuHasERMS)) {
+    CopyRow = CopyRow_16_ERMS;
+  }
+#endif
+#if defined(HAS_COPYROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+    CopyRow = CopyRow_16_NEON;
+  }
+#endif
+#if defined(HAS_COPYROW_16_MIPS)
+  if (TestCpuFlag(kCpuHasMIPS)) {
+    CopyRow = CopyRow_16_MIPS;
+  }
+#endif
+
+  // Copy plane
+  for (y = 0; y < height; ++y) {
+    CopyRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Copy I422.
+LIBYUV_API
+int I422Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  if (!src_y || !src_u || !src_v ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, height);
+  return 0;
+}
+
+// Copy I444.
+LIBYUV_API
+int I444Copy(const uint8* src_y, int src_stride_y,
+             const uint8* src_u, int src_stride_u,
+             const uint8* src_v, int src_stride_v,
+             uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int width, int height) {
+  if (!src_y || !src_u || !src_v ||
+      !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (height - 1) * src_stride_u;
+    src_v = src_v + (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height);
+  CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height);
+  return 0;
+}
+
+// Copy I400.
+LIBYUV_API
+int I400ToI400(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Convert I420 to I400.
+LIBYUV_API
+int I420ToI400(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Mirror a plane of data.
+void MirrorPlane(const uint8* src_y, int src_stride_y,
+                 uint8* dst_y, int dst_stride_y,
+                 int width, int height) {
+  int y;
+  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+#if defined(HAS_MIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+    MirrorRow = MirrorRow_NEON;
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) {
+    MirrorRow = MirrorRow_SSE2;
+  }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    MirrorRow = MirrorRow_SSSE3;
+  }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    MirrorRow = MirrorRow_AVX2;
+  }
+#endif
+
+  // Mirror plane
+  for (y = 0; y < height; ++y) {
+    MirrorRow(src_y, dst_y, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Convert YUY2 to I422.
+LIBYUV_API
+int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) =
+      YUY2ToUV422Row_C;
+  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
+      YUY2ToYRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_yuy2 == width * 2 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+      YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          YUY2ToYRow = YUY2ToYRow_SSE2;
+        }
+      }
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (width >= 16) {
+      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+      YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
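// Editorial sketch, not part of the patch: YUY2 packs two pixels into four
// bytes as Y0 U Y1 V, which is why the coalescing test above compares
// src_stride_yuy2 against width * 2 and the chroma strides against half the
// width. A hedged scalar version of the per-pair unpack the row kernels do:
static void UnpackYuy2PairSketch(const uint8* src_yuy2,
                                 uint8* y, uint8* u, uint8* v) {
  y[0] = src_yuy2[0];  // Y0
  *u   = src_yuy2[1];  // U, shared by both pixels
  y[1] = src_yuy2[2];  // Y1
  *v   = src_yuy2[3];  // V, shared by both pixels
}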
+// Convert UYVY to I422.
+LIBYUV_API
+int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int y;
+  void (*UYVYToUV422Row)(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) =
+      UYVYToUV422Row_C;
+  void (*UYVYToYRow)(const uint8* src_uyvy,
+                     uint8* dst_y, int pix) = UYVYToYRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
+  // Coalesce rows.
+  if (src_stride_uyvy == width * 2 &&
+      dst_stride_y == width &&
+      dst_stride_u * 2 == width &&
+      dst_stride_v * 2 == width) {
+    width *= height;
+    height = 1;
+    src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+  }
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+    UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
+    UYVYToYRow = UYVYToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
+      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+        UYVYToUV422Row = UYVYToUV422Row_SSE2;
+        if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+          UYVYToYRow = UYVYToYRow_SSE2;
+        }
+      }
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    UYVYToUV422Row = UYVYToUV422Row_Any_AVX2;
+    UYVYToYRow = UYVYToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToUV422Row = UYVYToUV422Row_AVX2;
+      UYVYToYRow = UYVYToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    UYVYToYRow = UYVYToYRow_Any_NEON;
+    if (width >= 16) {
+      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      UYVYToUV422Row = UYVYToUV422Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
+    UYVYToYRow(src_uyvy, dst_y, width);
+    src_uyvy += src_stride_uyvy;
+    dst_y += dst_stride_y;
+    dst_u += dst_stride_u;
+    dst_v += dst_stride_v;
+  }
+  return 0;
+}
+
+// Mirror I400 with optional flipping
+LIBYUV_API
+int I400Mirror(const uint8* src_y, int src_stride_y,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (!src_y || !dst_y ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+
+  MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  return 0;
+}
+
+// Mirror I420 with optional flipping
+LIBYUV_API
+int I420Mirror(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  MirrorPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+  MirrorPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+  return 0;
+}
+
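// Editorial sketch, not part of the patch: the "negative height means
// invert the image" convention used throughout this file doubles as a cheap
// vertical flip. Hedged example with arbitrary 320x240 dimensions and
// caller-allocated, unpadded planes:
//   // Mirror horizontally AND flip vertically in a single call.
//   int ret = I420Mirror(src_y, 320, src_u, 160, src_v, 160,
//                        dst_y, 320, dst_u, 160, dst_v, 160,
//                        320, -240);  // negative height flips vertically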
+// ARGB mirror.
+LIBYUV_API
+int ARGBMirror(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+      ARGBMirrorRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+
+#if defined(HAS_ARGBMIRRORROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
+    ARGBMirrorRow = ARGBMirrorRow_AVX2;
+  }
+#endif
+#if defined(HAS_ARGBMIRRORROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
+    ARGBMirrorRow = ARGBMirrorRow_NEON;
+  }
+#endif
+
+  // Mirror plane
+  for (y = 0; y < height; ++y) {
+    ARGBMirrorRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Get a blender that is optimized for the CPU, alignment and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+LIBYUV_API
+ARGBBlendRow GetARGBBlend() {
+  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBBlendRow = ARGBBlendRow_SSSE3;
+    return ARGBBlendRow;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBBlendRow = ARGBBlendRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ARGBBlendRow = ARGBBlendRow_NEON;
+  }
+#endif
+  return ARGBBlendRow;
+}
+
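// Editorial sketch, not part of the patch: GetARGBBlend resolves the best
// blend row function once so callers do not re-dispatch per row. A hedged
// example of using the returned pointer directly; ARGBBlend below already
// does this internally, so a hand-rolled loop only makes sense for unusual
// row layouts:
//   ARGBBlendRow blend = GetARGBBlend();  // picks SSSE3/SSE2/NEON/C once
//   for (int y = 0; y < height; ++y) {
//     blend(fg + y * fg_stride, bg + y * bg_stride,
//           dst + y * dst_stride, width);
//   }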
+// Alpha Blend 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+              const uint8* src_argb1, int src_stride_argb1,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  int y;
+  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width) = GetARGBBlend();
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+
+  for (y = 0; y < height; ++y) {
+    ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Multiply 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  int y;
+  void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                          int width) = ARGBMultiplyRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBMULTIPLYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBMULTIPLYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_NEON;
+    }
+  }
+#endif
+
+  // Multiply plane
+  for (y = 0; y < height; ++y) {
+    ARGBMultiplyRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Add 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
+            const uint8* src_argb1, int src_stride_argb1,
+            uint8* dst_argb, int dst_stride_argb,
+            int width, int height) {
+  int y;
+  void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                     int width) = ARGBAddRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBAddRow = ARGBAddRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+    ARGBAddRow = ARGBAddRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAddRow = ARGBAddRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBADDROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    ARGBAddRow = ARGBAddRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBADDROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBAddRow = ARGBAddRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_NEON;
+    }
+  }
+#endif
+
+  // Add plane
+  for (y = 0; y < height; ++y) {
+    ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Subtract 2 ARGB images and store to destination.
+LIBYUV_API
+int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
+                 const uint8* src_argb1, int src_stride_argb1,
+                 uint8* dst_argb, int dst_stride_argb,
+                 int width, int height) {
+  int y;
+  void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
+                          int width) = ARGBSubtractRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSUBTRACTROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+    ARGBSubtractRow = ARGBSubtractRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSubtractRow = ARGBSubtractRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    ARGBSubtractRow = ARGBSubtractRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBSubtractRow = ARGBSubtractRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSUBTRACTROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBSubtractRow = ARGBSubtractRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBSubtractRow = ARGBSubtractRow_NEON;
+    }
+  }
+#endif
+
+  // Subtract plane
+  for (y = 0; y < height; ++y) {
+    ARGBSubtractRow(src_argb0, src_argb1, dst_argb, width);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert I422 to BGRA.
+LIBYUV_API
+int I422ToBGRA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_bgra, int dst_stride_bgra,
+               int width, int height) {
+  int y;
+  void (*I422ToBGRARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToBGRARow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_bgra ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+    dst_stride_bgra = -dst_stride_bgra;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_bgra == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_bgra = 0;
+  }
+#if defined(HAS_I422TOBGRAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToBGRARow = I422ToBGRARow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToBGRARow = I422ToBGRARow_NEON;
+    }
+  }
+#elif defined(HAS_I422TOBGRAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
+        I422ToBGRARow = I422ToBGRARow_SSSE3;
+      }
+    }
+  }
+#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+      IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+    I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+    dst_bgra += dst_stride_bgra;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I422 to ABGR.
+LIBYUV_API
+int I422ToABGR(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_abgr, int dst_stride_abgr,
+               int width, int height) {
+  int y;
+  void (*I422ToABGRRow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToABGRRow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_abgr ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+    dst_stride_abgr = -dst_stride_abgr;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_abgr == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_abgr = 0;
+  }
+#if defined(HAS_I422TOABGRROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToABGRRow = I422ToABGRRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToABGRRow = I422ToABGRRow_NEON;
+    }
+  }
+#elif defined(HAS_I422TOABGRROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
+        I422ToABGRRow = I422ToABGRRow_SSSE3;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+    dst_abgr += dst_stride_abgr;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert I422 to RGBA.
+LIBYUV_API
+int I422ToRGBA(const uint8* src_y, int src_stride_y,
+               const uint8* src_u, int src_stride_u,
+               const uint8* src_v, int src_stride_v,
+               uint8* dst_rgba, int dst_stride_rgba,
+               int width, int height) {
+  int y;
+  void (*I422ToRGBARow)(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width) = I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v ||
+      !dst_rgba ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+    dst_stride_rgba = -dst_stride_rgba;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      src_stride_u * 2 == width &&
+      src_stride_v * 2 == width &&
+      dst_stride_rgba == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = src_stride_u = src_stride_v = dst_stride_rgba = 0;
+  }
+#if defined(HAS_I422TORGBAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGBARow = I422ToRGBARow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGBARow = I422ToRGBARow_NEON;
+    }
+  }
+#elif defined(HAS_I422TORGBAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+        I422ToRGBARow = I422ToRGBARow_SSSE3;
+      }
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+    dst_rgba += dst_stride_rgba;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
+// Convert NV12 to RGB565.
+LIBYUV_API
+int NV12ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_uv, int src_stride_uv,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*NV12ToRGB565Row)(const uint8* y_buf,
+                          const uint8* uv_buf,
+                          uint8* rgb_buf,
+                          int width) = NV12ToRGB565Row_C;
+  if (!src_y || !src_uv || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_NV12TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_SSSE3;
+    }
+  }
+#elif defined(HAS_NV12TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
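+    // The interleaved UV plane advances every other row (4:2:0 subsampling).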
+    if (y & 1) {
+      src_uv += src_stride_uv;
+    }
+  }
+  return 0;
+}
+
+// Convert NV21 to RGB565.
+LIBYUV_API
+int NV21ToRGB565(const uint8* src_y, int src_stride_y,
+                 const uint8* src_vu, int src_stride_vu,
+                 uint8* dst_rgb565, int dst_stride_rgb565,
+                 int width, int height) {
+  int y;
+  void (*NV21ToRGB565Row)(const uint8* y_buf,
+                          const uint8* src_vu,
+                          uint8* rgb_buf,
+                          int width) = NV21ToRGB565Row_C;
+  if (!src_y || !src_vu || !dst_rgb565 ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_NV21TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToRGB565Row = NV21ToRGB565Row_SSSE3;
+    }
+  }
+#elif defined(HAS_NV21TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToRGB565Row = NV21ToRGB565Row_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+LIBYUV_API
+void SetPlane(uint8* dst_y, int dst_stride_y,
+              int width, int height,
+              uint32 value) {
+  int y;
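+  // Replicate the byte value into all 4 bytes of a 32 bit word so that rows
+  // can be filled a word at a time.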
+  uint32 v32 = value | (value << 8) | (value << 16) | (value << 24);
+  void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C;
+  // Coalesce rows.
+  if (dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    dst_stride_y = 0;
+  }
+#if defined(HAS_SETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    SetRow = SetRow_NEON;
+  }
+#endif
+#if defined(HAS_SETROW_X86)
+  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+    SetRow = SetRow_X86;
+  }
+#endif
+
+  // Set plane
+  for (y = 0; y < height; ++y) {
+    SetRow(dst_y, v32, width);
+    dst_y += dst_stride_y;
+  }
+}
+
+// Draw a rectangle into I420
+LIBYUV_API
+int I420Rect(uint8* dst_y, int dst_stride_y,
+             uint8* dst_u, int dst_stride_u,
+             uint8* dst_v, int dst_stride_v,
+             int x, int y,
+             int width, int height,
+             int value_y, int value_u, int value_v) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  uint8* start_y = dst_y + y * dst_stride_y + x;
+  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+  if (!dst_y || !dst_u || !dst_v ||
+      width <= 0 || height <= 0 ||
+      x < 0 || y < 0 ||
+      value_y < 0 || value_y > 255 ||
+      value_u < 0 || value_u > 255 ||
+      value_v < 0 || value_v > 255) {
+    return -1;
+  }
+
+  SetPlane(start_y, dst_stride_y, width, height, value_y);
+  SetPlane(start_u, dst_stride_u, halfwidth, halfheight, value_u);
+  SetPlane(start_v, dst_stride_v, halfwidth, halfheight, value_v);
+  return 0;
+}
+
+// Draw a rectangle into ARGB
+LIBYUV_API
+int ARGBRect(uint8* dst_argb, int dst_stride_argb,
+             int dst_x, int dst_y,
+             int width, int height,
+             uint32 value) {
+  if (!dst_argb ||
+      width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  dst_argb += dst_y * dst_stride_argb + dst_x * 4;
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_SETROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height);
+    return 0;
+  }
+#endif
+#if defined(HAS_SETROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height);
+    return 0;
+  }
+#endif
+  ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height);
+  return 0;
+}
+
+// Convert unattenuated ARGB to preattenuated ARGB.
+// An unattenuated ARGB alpha blend uses the formula
+// p = a * f + (1 - a) * b
+// where
+//   p is output pixel
+//   f is foreground pixel
+//   b is background pixel
+//   a is alpha value from foreground pixel
+// A preattenuated ARGB alpha blend uses the formula
+// p = f + (1 - a) * b
+// where
+//   f is foreground pixel premultiplied by alpha
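+// For example, with alpha normalized to [0, 1]: blending a foreground value
+// f = 200 with a = 0.5 over a background value b = 100 gives
+// p = 0.5 * 200 + (1 - 0.5) * 100 = 150 in the unattenuated form. In the
+// preattenuated form the foreground is stored as 0.5 * 200 = 100, so
+// p = 100 + (1 - 0.5) * 100 = 150, the same result with one less multiply.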
+
+LIBYUV_API
+int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height) {
+  int y;
+  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                           int width) = ARGBAttenuateRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBAttenuateRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert preattenuated ARGB to unattenuated ARGB.
+LIBYUV_API
+int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height) {
+  int y;
+  void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+                             int width) = ARGBUnattenuateRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBUNATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+    ARGBUnattenuateRow = ARGBUnattenuateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBUNATTENUATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    ARGBUnattenuateRow = ARGBUnattenuateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
+    }
+  }
+#endif
+// TODO(fbarchard): Neon version.
+
+  for (y = 0; y < height; ++y) {
+    ARGBUnattenuateRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert ARGB to Grayed ARGB.
+LIBYUV_API
+int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_argb, int dst_stride_argb,
+               int width, int height) {
+  int y;
+  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+                      int width) = ARGBGrayRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBGrayRow = ARGBGrayRow_SSSE3;
+  }
+#elif defined(HAS_ARGBGRAYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_NEON;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBGrayRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Make a rectangle of ARGB gray scale.
+LIBYUV_API
+int ARGBGray(uint8* dst_argb, int dst_stride_argb,
+             int dst_x, int dst_y,
+             int width, int height) {
+  int y;
+  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
+                      int width) = ARGBGrayRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBGRAYROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBGrayRow = ARGBGrayRow_SSSE3;
+  }
+#elif defined(HAS_ARGBGRAYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBGrayRow(dst, dst, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Make a rectangle of ARGB Sepia tone.
+LIBYUV_API
+int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
+              int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSEPIAROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBSepiaRow = ARGBSepiaRow_SSSE3;
+  }
+#elif defined(HAS_ARGBSEPIAROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBSepiaRow(dst, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a 4x4 matrix to each ARGB pixel.
+// Note: Normally for shading, but can be used to swizzle or invert.
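+// Matrix coefficients are signed fixed point values where 64 represents 1.0
+// (see RGBColorMatrix below, which converts a 7 bit matrix to this form).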
+LIBYUV_API
+int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
+                    uint8* dst_argb, int dst_stride_argb,
+                    const int8* matrix_argb,
+                    int width, int height) {
+  int y;
+  void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
+      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+  if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORMATRIXROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_SSSE3;
+  }
+#elif defined(HAS_ARGBCOLORMATRIXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a 4x3 matrix to each ARGB pixel.
+// Deprecated.
+LIBYUV_API
+int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
+                   const int8* matrix_rgb,
+                   int dst_x, int dst_y, int width, int height) {
+  SIMD_ALIGNED(int8 matrix_argb[16]);
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+
+  // Convert 4x3 7 bit matrix to 4x4 6 bit matrix.
+  matrix_argb[0] = matrix_rgb[0] / 2;
+  matrix_argb[1] = matrix_rgb[1] / 2;
+  matrix_argb[2] = matrix_rgb[2] / 2;
+  matrix_argb[3] = matrix_rgb[3] / 2;
+  matrix_argb[4] = matrix_rgb[4] / 2;
+  matrix_argb[5] = matrix_rgb[5] / 2;
+  matrix_argb[6] = matrix_rgb[6] / 2;
+  matrix_argb[7] = matrix_rgb[7] / 2;
+  matrix_argb[8] = matrix_rgb[8] / 2;
+  matrix_argb[9] = matrix_rgb[9] / 2;
+  matrix_argb[10] = matrix_rgb[10] / 2;
+  matrix_argb[11] = matrix_rgb[11] / 2;
+  matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
+  matrix_argb[15] = 64;  // 1.0
+
+  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
+                         dst, dst_stride_argb,
+                         &matrix_argb[0], width, height);
+}
+
+// Apply a color table to each ARGB pixel.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                   const uint8* table_argb,
+                   int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+                            int width) = ARGBColorTableRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    ARGBColorTableRow = ARGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a color table to each ARGB pixel but preserve destination alpha.
+// Table contains 256 ARGB values.
+LIBYUV_API
+int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
+                  const uint8* table_argb,
+                  int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+                           int width) = RGBColorTableRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
+      dst_x < 0 || dst_y < 0) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_RGBCOLORTABLEROW_X86)
+  if (TestCpuFlag(kCpuHasX86)) {
+    RGBColorTableRow = RGBColorTableRow_X86;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    RGBColorTableRow(dst, table_argb, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// ARGBQuantize is used to posterize art.
+// e.g. rgb / qvalue * qvalue + qvalue / 2
+// But the low level row functions implement it efficiently with 3 parameters,
+// and could be used for other high level operations.
+// dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+// where scale is 1 / interval_size as a fixed point value.
+// The divide is replaced by a fixed point multiply with the reciprocal.
+// Caveat - although SSE2 saturates, the C function does not and should be used
+// with care if doing anything but quantization.
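+// For example, to posterize to steps of 32, pass interval_size = 32,
+// interval_offset = 16 and scale = 65536 / 32 = 2048; an input value of 100
+// then becomes (100 * 2048 >> 16) * 32 + 16 = 3 * 32 + 16 = 112.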
+LIBYUV_API
+int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
+                 int scale, int interval_size, int interval_offset,
+                 int dst_x, int dst_y, int width, int height) {
+  int y;
+  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) = ARGBQuantizeRow_C;
+  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
+      interval_size < 1 || interval_size > 255) {
+    return -1;
+  }
+  // Coalesce rows.
+  if (dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBQUANTIZEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_SSE2;
+  }
+#elif defined(HAS_ARGBQUANTIZEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_NEON;
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
+    dst += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Computes a table of cumulative sums for the image, where each value is the
+// sum of all values above and to the left of the entry. Used by ARGBBlur.
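+// For example, for a single channel plane with rows {1, 2} and {3, 4} the
+// cumulative sums are {1, 3} and {4, 10}; the sum over any axis aligned
+// rectangle can then be derived from its four corner entries.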
+LIBYUV_API
+int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
+                             int32* dst_cumsum, int dst_stride32_cumsum,
+                             int width, int height) {
+  int y;
+  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
+      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+  int32* previous_cumsum = dst_cumsum;
+  if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
+    return -1;
+  }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+  }
+#endif
+  memset(dst_cumsum, 0, width * sizeof(dst_cumsum[0]) * 4);  // 4 ints per pixel.
+  for (y = 0; y < height; ++y) {
+    ComputeCumulativeSumRow(src_argb, dst_cumsum, previous_cumsum, width);
+    previous_cumsum = dst_cumsum;
+    dst_cumsum += dst_stride32_cumsum;
+    src_argb += src_stride_argb;
+  }
+  return 0;
+}
+
+// Blur ARGB image.
+// The caller should allocate a CumulativeSum table of width * height * 16
+// bytes, aligned to a 16 byte boundary. The height can be radius * 2 + 2 to
+// save memory, as the buffer is treated as circular.
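+// For example, for radius r one workable allocation is (2 * r + 2) rows of
+// width * 4 int32 entries (width * 16 bytes per row, 16 byte aligned), passed
+// as dst_cumsum with dst_stride32_cumsum = width * 4.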
+LIBYUV_API
+int ARGBBlur(const uint8* src_argb, int src_stride_argb,
+             uint8* dst_argb, int dst_stride_argb,
+             int32* dst_cumsum, int dst_stride32_cumsum,
+             int width, int height, int radius) {
+  int y;
+  void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
+      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
+  void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
+      int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
+  int32* cumsum_bot_row;
+  int32* max_cumsum_bot_row;
+  int32* cumsum_top_row;
+
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  if (radius > height) {
+    radius = height;
+  }
+  if (radius > (width / 2 - 1)) {
+    radius = width / 2 - 1;
+  }
+  if (radius <= 0) {
+    return -1;
+  }
+#if defined(HAS_CUMULATIVESUMTOAVERAGEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ComputeCumulativeSumRow = ComputeCumulativeSumRow_SSE2;
+    CumulativeSumToAverageRow = CumulativeSumToAverageRow_SSE2;
+  }
+#endif
+  // Compute enough CumulativeSum for first row to be blurred. After this
+  // one row of CumulativeSum is updated at a time.
+  ARGBComputeCumulativeSum(src_argb, src_stride_argb,
+                           dst_cumsum, dst_stride32_cumsum,
+                           width, radius);
+
+  src_argb = src_argb + radius * src_stride_argb;
+  cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
+
+  max_cumsum_bot_row = &dst_cumsum[(radius * 2 + 2) * dst_stride32_cumsum];
+  cumsum_top_row = &dst_cumsum[0];
+
+  for (y = 0; y < height; ++y) {
+    int top_y = ((y - radius - 1) >= 0) ? (y - radius - 1) : 0;
+    int bot_y = ((y + radius) < height) ? (y + radius) : (height - 1);
+    int area = radius * (bot_y - top_y);
+    int boxwidth = radius * 4;
+    int x;
+    int n;
+
+    // Increment cumsum_top_row pointer with circular buffer wrap around.
+    if (top_y) {
+      cumsum_top_row += dst_stride32_cumsum;
+      if (cumsum_top_row >= max_cumsum_bot_row) {
+        cumsum_top_row = dst_cumsum;
+      }
+    }
+    // Increment cumsum_bot_row pointer with circular buffer wrap around and
+    // then fill in a row of CumulativeSum.
+    if ((y + radius) < height) {
+      const int32* prev_cumsum_bot_row = cumsum_bot_row;
+      cumsum_bot_row += dst_stride32_cumsum;
+      if (cumsum_bot_row >= max_cumsum_bot_row) {
+        cumsum_bot_row = dst_cumsum;
+      }
+      ComputeCumulativeSumRow(src_argb, cumsum_bot_row, prev_cumsum_bot_row,
+                              width);
+      src_argb += src_stride_argb;
+    }
+
+    // Left clipped.
+    for (x = 0; x < radius + 1; ++x) {
+      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+                                boxwidth, area, &dst_argb[x * 4], 1);
+      area += (bot_y - top_y);
+      boxwidth += 4;
+    }
+
+    // Middle unclipped.
+    n = (width - 1) - radius - x + 1;
+    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
+                              boxwidth, area, &dst_argb[x * 4], n);
+
+    // Right clipped.
+    for (x += n; x <= width - 1; ++x) {
+      area -= (bot_y - top_y);
+      boxwidth -= 4;
+      CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
+                                cumsum_bot_row + (x - radius - 1) * 4,
+                                boxwidth, area, &dst_argb[x * 4], 1);
+    }
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Multiply ARGB image by a specified ARGB value.
+LIBYUV_API
+int ARGBShade(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height, uint32 value) {
+  int y;
+  void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
+                       int width, uint32 value) = ARGBShadeRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSHADEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    ARGBShadeRow = ARGBShadeRow_SSE2;
+  }
+#elif defined(HAS_ARGBSHADEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBShadeRow = ARGBShadeRow_NEON;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBShadeRow(src_argb, dst_argb, width, value);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Interpolate 2 ARGB images by specified amount (0 to 255).
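+// An interpolation of 0 copies src_argb0 and 255 is nearly src_argb1; the row
+// functions are assumed to weight src_argb1 by interpolation / 256 and
+// src_argb0 by the remainder.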
+LIBYUV_API
+int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
+                    const uint8* src_argb1, int src_stride_argb1,
+                    uint8* dst_argb, int dst_stride_argb,
+                    int width, int height, int interpolation) {
+  int y;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb0 == width * 4 &&
+      src_stride_argb1 == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0;
+  }
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        InterpolateRow = InterpolateRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
+          IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        InterpolateRow = InterpolateRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 8) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && width >= 1 &&
+      IS_ALIGNED(src_argb0, 4) && IS_ALIGNED(src_stride_argb0, 4) &&
+      IS_ALIGNED(src_argb1, 4) && IS_ALIGNED(src_stride_argb1, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+    InterpolateRow = InterpolateRow_MIPS_DSPR2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    InterpolateRow(dst_argb, src_argb0, src_argb1 - src_argb0,
+                   width * 4, interpolation);
+    src_argb0 += src_stride_argb0;
+    src_argb1 += src_stride_argb1;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Shuffle ARGB channel order.  e.g. BGRA to ARGB.
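+// The shuffler is assumed to hold, for each destination byte, the index of
+// the source byte within a pixel, e.g. {3, 2, 1, 0} reverses the byte order
+// of every pixel; the SIMD paths typically expect the pattern repeated to
+// fill 16 bytes.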
+LIBYUV_API
+int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
+                uint8* dst_argb, int dst_stride_argb,
+                const uint8* shuffler, int width, int height) {
+  int y;
+  void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
+                         const uint8* shuffler, int pix) = ARGBShuffleRow_C;
+  if (!src_bgra || !dst_argb ||
+      width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+    src_stride_bgra = -src_stride_bgra;
+  }
+  // Coalesce rows.
+  if (src_stride_bgra == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_bgra = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBSHUFFLEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBShuffleRow = ARGBShuffleRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBShuffleRow = ARGBShuffleRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        ARGBShuffleRow = ARGBShuffleRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBShuffleRow = ARGBShuffleRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBSHUFFLEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 4) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_NEON;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBShuffleRow = ARGBShuffleRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
+    src_bgra += src_stride_bgra;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Sobel ARGB effect.
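+// Extracts the G channel as luma into a 3 row window, computes Sobel X and
+// Sobel Y gradients per pixel, and combines them with the supplied SobelRow.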
+static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
+                        uint8* dst_argb, int dst_stride_argb,
+                        int width, int height,
+                        void (*SobelRow)(const uint8* src_sobelx,
+                                         const uint8* src_sobely,
+                                         uint8* dst, int width)) {
+  int y;
+  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+                         uint32 selector, int pix) = ARGBToBayerGGRow_C;
+  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) = SobelYRow_C;
+  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobely, int width) =
+      SobelXRow_C;
+  const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
+  if (!src_argb  || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // ARGBToBayer used to select G channel from ARGB.
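+  // The selector 0x0d090501 picks source bytes 1, 5, 9 and 13, i.e. the G
+  // byte of four consecutive ARGB pixels stored as B, G, R, A in memory.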
+#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+    ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOBAYERGGROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToBayerRow = ARGBToBayerGGRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SOBELYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelYRow = SobelYRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelYRow = SobelYRow_NEON;
+  }
+#endif
+#if defined(HAS_SOBELXROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    SobelXRow = SobelXRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELXROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SobelXRow = SobelXRow_NEON;
+  }
+#endif
+  {
+    // 3 rows with edges before/after.
+    const int kRowSize = (width + kEdge + 15) & ~15;
+    align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
+    uint8* row_sobelx = rows;
+    uint8* row_sobely = rows + kRowSize;
+    uint8* row_y = rows + kRowSize * 2;
+
+    // Convert first row.
+    uint8* row_y0 = row_y + kEdge;
+    uint8* row_y1 = row_y0 + kRowSize;
+    uint8* row_y2 = row_y1 + kRowSize;
+    ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
+    row_y0[-1] = row_y0[0];
+    memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
+    ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
+    row_y1[-1] = row_y1[0];
+    memset(row_y1 + width, row_y1[width - 1], 16);
+    memset(row_y2 + width, 0, 16);
+
+    for (y = 0; y < height; ++y) {
+      // Convert next row of ARGB to Y.
+      if (y < (height - 1)) {
+        src_argb += src_stride_argb;
+      }
+      ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);
+      row_y2[-1] = row_y2[0];
+      row_y2[width] = row_y2[width - 1];
+
+      SobelXRow(row_y0 - 1, row_y1 - 1, row_y2 - 1, row_sobelx, width);
+      SobelYRow(row_y0 - 1, row_y2 - 1, row_sobely, width);
+      SobelRow(row_sobelx, row_sobely, dst_argb, width);
+
+      // Cycle through the circular queue of 3 row_y buffers.
+      {
+        uint8* row_yt = row_y0;
+        row_y0 = row_y1;
+        row_y1 = row_y2;
+        row_y2 = row_yt;
+      }
+
+      dst_argb += dst_stride_argb;
+    }
+    free_aligned_buffer_64(rows);
+  }
+  return 0;
+}
+
+// Sobel ARGB effect.
+LIBYUV_API
+int ARGBSobel(const uint8* src_argb, int src_stride_argb,
+              uint8* dst_argb, int dst_stride_argb,
+              int width, int height) {
+  void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) = SobelRow_C;
+#if defined(HAS_SOBELROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    SobelRow = SobelRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    SobelRow = SobelRow_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, SobelRow);
+}
+
+// Sobel ARGB effect with planar output.
+LIBYUV_API
+int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
+                     uint8* dst_y, int dst_stride_y,
+                     int width, int height) {
+  void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_, int width) = SobelToPlaneRow_C;
+#if defined(HAS_SOBELTOPLANEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+    SobelToPlaneRow = SobelToPlaneRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELTOPLANEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+    SobelToPlaneRow = SobelToPlaneRow_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
+                      width, height, SobelToPlaneRow);
+}
+
+// SobelXY ARGB effect.
+// Similar to Sobel, but also stores Sobel X in R and Sobel Y in B.  G = Sobel.
+LIBYUV_API
+int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
+                uint8* dst_argb, int dst_stride_argb,
+                int width, int height) {
+  void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) = SobelXYRow_C;
+#if defined(HAS_SOBELXYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+    SobelXYRow = SobelXYRow_SSE2;
+  }
+#endif
+#if defined(HAS_SOBELXYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    SobelXYRow = SobelXYRow_NEON;
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+                      width, height, SobelXYRow);
+}
+
+// Apply a 4x4 polynomial to each ARGB pixel.
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   const float* poly,
+                   int width, int height) {
+  int y;
+  void (*ARGBPolynomialRow)(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) = ARGBPolynomialRow_C;
+  if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
+    ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBPOLYNOMIALROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasFMA3) &&
+      IS_ALIGNED(width, 2)) {
+    ARGBPolynomialRow = ARGBPolynomialRow_AVX2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBPolynomialRow(src_argb, dst_argb, poly, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Apply a luma color table to each ARGB pixel.
+LIBYUV_API
+int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_argb, int dst_stride_argb,
+                       const uint8* luma,
+                       int width, int height) {
+  int y;
+  void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
+      int width, const uint8* luma, const uint32 lumacoeff) =
+      ARGBLumaColorTableRow_C;
+  if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
+    ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Copy Alpha from one ARGB image to another.
+LIBYUV_API
+int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
+                  uint8* dst_argb, int dst_stride_argb,
+                  int width, int height) {
+  int y;
+  void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
+      ARGBCopyAlphaRow_C;
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOPYALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
+      IS_ALIGNED(width, 8)) {
+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBCOPYALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+    ARGBCopyAlphaRow = ARGBCopyAlphaRow_AVX2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBCopyAlphaRow(src_argb, dst_argb, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Copy a planar Y channel to the alpha channel of a destination ARGB image.
+LIBYUV_API
+int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
+                     uint8* dst_argb, int dst_stride_argb,
+                     int width, int height) {
+  int y;
+  void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
+      ARGBCopyYToAlphaRow_C;
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width &&
+      dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_argb = 0;
+  }
+#if defined(HAS_ARGBCOPYYTOALPHAROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16) &&
+      IS_ALIGNED(width, 8)) {
+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_SSE2;
+  }
+#endif
+#if defined(HAS_ARGBCOPYYTOALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 16)) {
+    ARGBCopyYToAlphaRow = ARGBCopyYToAlphaRow_AVX2;
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    ARGBCopyYToAlphaRow(src_y, dst_argb, width);
+    src_y += src_stride_y;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/third_party/libyuv/source/row.h
+++ /dev/null
@@ -1,264 +1,0 @@
-/*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef LIBYUV_SOURCE_ROW_H_
-#define LIBYUV_SOURCE_ROW_H_
-
-#include "third_party/libyuv/include/libyuv/basic_types.h"
-
-#define kMaxStride (2048 * 4)
-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
-
-#if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR)
-#define YUV_DISABLE_ASM
-#endif
-
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
-#define HAS_FASTCONVERTYUVTOARGBROW_NEON
-void FastConvertYUVToARGBRow_NEON(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  uint8* rgb_buf,
-                                  int width);
-#define HAS_FASTCONVERTYUVTOBGRAROW_NEON
-void FastConvertYUVToBGRARow_NEON(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  uint8* rgb_buf,
-                                  int width);
-#define HAS_FASTCONVERTYUVTOABGRROW_NEON
-void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  uint8* rgb_buf,
-                                  int width);
-#endif
-
-// The following are available on all x86 platforms
-#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
-    !defined(YUV_DISABLE_ASM)
-#define HAS_ABGRTOARGBROW_SSSE3
-#define HAS_BGRATOARGBROW_SSSE3
-#define HAS_BG24TOARGBROW_SSSE3
-#define HAS_RAWTOARGBROW_SSSE3
-#define HAS_RGB24TOYROW_SSSE3
-#define HAS_RAWTOYROW_SSSE3
-#define HAS_RGB24TOUVROW_SSSE3
-#define HAS_RAWTOUVROW_SSSE3
-#define HAS_ARGBTOYROW_SSSE3
-#define HAS_BGRATOYROW_SSSE3
-#define HAS_ABGRTOYROW_SSSE3
-#define HAS_ARGBTOUVROW_SSSE3
-#define HAS_BGRATOUVROW_SSSE3
-#define HAS_ABGRTOUVROW_SSSE3
-#define HAS_I400TOARGBROW_SSE2
-#define HAS_FASTCONVERTYTOARGBROW_SSE2
-#define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
-#define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
-#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
-#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
-#define HAS_REVERSE_ROW_SSSE3
-#endif
-
-// The following are available on Neon platforms
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
-#define HAS_REVERSE_ROW_NEON
-#endif
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#ifdef HAS_ARGBTOYROW_SSSE3
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-#endif
-#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
-#define HASRGB24TOYROW_SSSE3
-#endif
-#ifdef HASRGB24TOYROW_SSSE3
-void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
-void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-#endif
-#ifdef HAS_REVERSE_ROW_SSSE3
-void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
-#endif
-#ifdef HAS_REVERSE_ROW_NEON
-void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
-#endif
-void ReverseRow_C(const uint8* src, uint8* dst, int width);
-
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
-void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
-                   uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
-                   uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
-                   uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
-                    uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
-                  uint8* dst_u, uint8* dst_v, int width);
-
-#ifdef HAS_BG24TOARGBROW_SSSE3
-void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
-void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
-void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
-void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
-#endif
-void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
-void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
-void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
-void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
-
-#ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
-#endif
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);
-
-#if defined(_MSC_VER)
-#define SIMD_ALIGNED(var) __declspec(align(16)) var
-typedef __declspec(align(16)) signed char vec8[16];
-typedef __declspec(align(16)) unsigned char uvec8[16];
-typedef __declspec(align(16)) signed short vec16[8];
-#else // __GNUC__
-#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
-typedef signed char __attribute__((vector_size(16))) vec8;
-typedef unsigned char __attribute__((vector_size(16))) uvec8;
-typedef signed short __attribute__((vector_size(16))) vec16;
-#endif
-
-//extern "C"
-SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
-//extern "C"
-SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
-//extern "C"
-SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
-
-void FastConvertYUVToARGBRow_C(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width);
-
-void FastConvertYUVToBGRARow_C(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width);
-
-void FastConvertYUVToABGRRow_C(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* rgb_buf,
-                               int width);
-
-void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  uint8* rgb_buf,
-                                  int width);
-
-void FastConvertYToARGBRow_C(const uint8* y_buf,
-                             uint8* rgb_buf,
-                             int width);
-
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
-void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  uint8* rgb_buf,
-                                  int width);
-
-void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* rgb_buf,
-                                   int width);
-
-void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  uint8* rgb_buf,
-                                  int width);
-
-void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  uint8* rgb_buf,
-                                  int width);
-
-void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,
-                                     const uint8* u_buf,
-                                     const uint8* v_buf,
-                                     uint8* rgb_buf,
-                                     int width);
-
-void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
-                                uint8* rgb_buf,
-                                int width);
-#endif
-
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
-void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* rgb_buf,
-                                   int width);
-
-void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* rgb_buf,
-                                   int width);
-
-void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
-                                   const uint8* u_buf,
-                                   const uint8* v_buf,
-                                   uint8* rgb_buf,
-                                   int width);
-
-void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
-                                      const uint8* u_buf,
-                                      const uint8* v_buf,
-                                      uint8* rgb_buf,
-                                      int width);
-
-#endif
-
-#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
-void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
-                                uint8* rgb_buf,
-                                int width);
-
-#endif
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
-#endif  // LIBYUV_SOURCE_ROW_H_
--- /dev/null
+++ b/third_party/libyuv/source/row_any.cc
@@ -1,0 +1,542 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#include "third_party/libyuv/include/libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels.
+// TODO(fbarchard): Consider 'any' functions handling odd alignment.
+// YUV to RGB does a multiple of MASK + 1 pixels with SIMD and the remainder
+// with C.
+#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK)        \
+    void NAMEANY(const uint8* y_buf,                                           \
+                 const uint8* u_buf,                                           \
+                 const uint8* v_buf,                                           \
+                 uint8* rgb_buf,                                               \
+                 int width) {                                                  \
+      int n = width & ~MASK;                                                   \
+      I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n);                         \
+      I420TORGB_C(y_buf + n,                                                   \
+                  u_buf + (n >> UV_SHIFT),                                     \
+                  v_buf + (n >> UV_SHIFT),                                     \
+                  rgb_buf + n * BPP, width & MASK);                            \
+    }
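+
+// Illustrative expansion only: for I422ToARGBRow_Any_SSSE3 below
+// (UV_SHIFT 1, BPP 4, MASK 7) and width 100, n = 100 & ~7 = 96, so the SIMD
+// kernel converts 96 pixels and the C kernel converts the last 4, reading
+// y + 96, u + 48, v + 48 and writing rgb + 384.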
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
+     0, 4, 7)
+YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
+     1, 4, 7)
+YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
+     2, 4, 7)
+YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
+     1, 4, 7)
+YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C,
+     1, 4, 7)
+YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C,
+     1, 4, 7)
+// I422ToRGB565Row_SSSE3 is unaligned.
+YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
+     1, 2, 7)
+YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C,
+     1, 2, 7)
+YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C,
+     1, 2, 7)
+// I422ToRGB24Row_SSSE3 is unaligned.
+YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
+YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
+YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
+YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
+#endif  // HAS_I422TOARGBROW_SSSE3
+#ifdef HAS_I422TOARGBROW_AVX2
+YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
+#endif  // HAS_I422TOARGBROW_AVX2
+#ifdef HAS_I422TOARGBROW_NEON
+YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7)
+YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7)
+YANY(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, I411ToARGBRow_C, 2, 4, 7)
+YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4, 7)
+YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4, 7)
+YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4, 7)
+YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3, 7)
+YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3, 7)
+YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
+     1, 2, 7)
+YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
+     1, 2, 7)
+YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
+YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
+YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
+#endif  // HAS_I422TOARGBROW_NEON
+#undef YANY
+
+// Wrappers to handle widths that are not a multiple of 8.
+#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP)             \
+    void NAMEANY(const uint8* y_buf,                                           \
+                 const uint8* uv_buf,                                          \
+                 uint8* rgb_buf,                                               \
+                 int width) {                                                  \
+      int n = width & ~7;                                                      \
+      NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n);                               \
+      NV12TORGB_C(y_buf + n,                                                   \
+                  uv_buf + (n >> UV_SHIFT),                                    \
+                  rgb_buf + n * BPP, width & 7);                               \
+    }
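+
+// Illustrative note: the NV12/NV21 instantiations below use UV_SHIFT 0
+// because the interleaved UV plane carries one byte per output pixel
+// (one U/V pair per two pixels), so the C tail reads uv_buf + n directly.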
+
+#ifdef HAS_NV12TOARGBROW_SSSE3
+NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C,
+      0, 4)
+NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C,
+      0, 4)
+#endif  // HAS_NV12TOARGBROW_SSSE3
+#ifdef HAS_NV12TOARGBROW_NEON
+NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4)
+NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4)
+#endif  // HAS_NV12TOARGBROW_NEON
+#ifdef HAS_NV12TORGB565ROW_SSSE3
+NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C,
+      0, 2)
+NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C,
+      0, 2)
+#endif  // HAS_NV12TORGB565ROW_SSSE3
+#ifdef HAS_NV12TORGB565ROW_NEON
+NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2)
+NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2)
+#endif  // HAS_NV12TORGB565ROW_NEON
+#undef NV2NY
+
+#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP)          \
+    void NAMEANY(const uint8* src,                                             \
+                 uint8* dst,                                                   \
+                 int width) {                                                  \
+      int n = width & ~MASK;                                                   \
+      ARGBTORGB_SIMD(src, dst, n);                                             \
+      ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK);                \
+    }
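+
+// Illustrative expansion only: for ARGBToRGB24Row_Any_SSSE3 below
+// (MASK 15, SBPP 4, BPP 3) and width 100, n = 96, so the SIMD kernel packs
+// 96 pixels and the C kernel packs the last 4, reading src + 384 and
+// writing dst + 288.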
+
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C,
+       15, 4, 3)
+RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C,
+       15, 4, 3)
+RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C,
+       3, 4, 2)
+RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C,
+       3, 4, 2)
+RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
+       3, 4, 2)
+#endif
+#if defined(HAS_I400TOARGBROW_SSE2)
+RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
+       7, 1, 4)
+#endif
+#if defined(HAS_YTOARGBROW_SSE2)
+RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
+       7, 1, 4)
+RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
+       15, 2, 4)
+RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
+       15, 2, 4)
+// These require alignment on ARGB, so C is used for remainder.
+RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C,
+       15, 3, 4)
+RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C,
+       15, 3, 4)
+RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C,
+       7, 2, 4)
+RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C,
+       7, 2, 4)
+RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C,
+       7, 2, 4)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_NEON)
+RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3)
+RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 7, 4, 3)
+RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C,
+       7, 4, 2)
+RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C,
+       7, 4, 2)
+RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
+       7, 4, 2)
+RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
+       7, 1, 4)
+RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
+       7, 1, 4)
+RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
+       7, 2, 4)
+RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
+       7, 2, 4)
+#endif
+#undef RGBANY
+
+// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst.
+#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP)        \
+    void NAMEANY(const uint8* src,                                             \
+                 uint8* dst, uint32 selector,                                  \
+                 int width) {                                                  \
+      int n = width & ~MASK;                                                   \
+      ARGBTORGB_SIMD(src, dst, selector, n);                                   \
+      ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK);      \
+    }
+
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C,
+         7, 4, 1)
+#endif
+#if defined(HAS_ARGBTOBAYERROW_NEON)
+BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
+         7, 4, 1)
+#endif
+#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
+BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C,
+         7, 4, 1)
+#endif
+#if defined(HAS_ARGBTOBAYERGGROW_NEON)
+BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
+         7, 4, 1)
+#endif
+
+#undef BAYERANY
+
+// RGB/YUV to Y does the bulk of the row with SIMD and the last NUM pixels
+// with a second, overlapping SIMD call.
+#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM)                            \
+    void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) {             \
+      ARGBTOY_SIMD(src_argb, dst_y, width - NUM);                              \
+      ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP,                            \
+                   dst_y + (width - NUM) * BPP, NUM);                          \
+    }
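+
+// Illustrative note: unlike the masked wrappers above, this form has no C
+// tail. The second SIMD call is anchored NUM pixels from the end of the row,
+// so the final (possibly unaligned) pixels overlap work already done by the
+// first call. This presumes the row kernel rounds a partial step up rather
+// than skipping it, and that the output does not alias the input (see the
+// attenuate/shuffle notes below for cases where overlap is not safe).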
+
+#ifdef HAS_ARGBTOYROW_AVX2
+YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32)
+YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32)
+YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32)
+YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32)
+#endif
+#ifdef HAS_ARGBTOYROW_SSSE3
+YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
+#endif
+#ifdef HAS_BGRATOYROW_SSSE3
+YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16)
+YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16)
+YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16)
+#endif
+#ifdef HAS_ARGBTOYJROW_SSSE3
+YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_Unaligned_SSSE3, 4, 1, 16)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8)
+YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 4, 1, 8)
+YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 4, 1, 8)
+YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 4, 1, 8)
+YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 4, 1, 8)
+YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 3, 1, 8)
+YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
+YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
+YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
+YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
+YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
+YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
+YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
+YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
+YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
+YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
+YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
+#endif
+#undef YANY
+
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK)                \
+    void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) {             \
+      int n = width & ~MASK;                                                   \
+      ARGBTOY_SIMD(src_argb, dst_y, n);                                        \
+      ARGBTOY_C(src_argb + n * SBPP,                                           \
+                dst_y  + n * BPP, width & MASK);                               \
+    }
+
+// Attenuate is destructive, so the overlapping last-NUM method cannot be
+// used.
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
+     4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C,
+     4, 4, 3)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+YANY(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, ARGBUnattenuateRow_C,
+     4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+YANY(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, ARGBAttenuateRow_C,
+     4, 4, 7)
+#endif
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C,
+     4, 4, 7)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
+     4, 4, 7)
+#endif
+#undef YANY
+
+// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
+#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK)                     \
+    void NAMEANY(const uint8* src_argb, int src_stride_argb,                   \
+                 uint8* dst_u, uint8* dst_v, int width) {                      \
+      int n = width & ~MASK;                                                   \
+      ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n);                \
+      ANYTOUV_C(src_argb  + n * BPP, src_stride_argb,                          \
+                dst_u + (n >> 1),                                              \
+                dst_v + (n >> 1),                                              \
+                width & MASK);                                                 \
+    }
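+
+// Illustrative expansion only: for ARGBToUVRow_Any_SSSE3 below (BPP 4,
+// MASK 15) and width 100, n = 96; the SIMD kernel averages 2x2 pixel blocks
+// across the first 96 columns of the rows at src_argb and
+// src_argb + src_stride_argb, and the C kernel handles the last 4 columns,
+// writing U and V at offset 48 (n >> 1) because chroma is subsampled 2x
+// horizontally.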
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31)
+UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31)
+UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
+UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_Unaligned_SSSE3, ARGBToUVJRow_C,
+      4, 15)
+UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
+UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
+UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
+UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
+UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_ARGBTOUVROW_NEON
+UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
+UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15)
+UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
+UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
+UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
+UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
+UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
+UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
+UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
+UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
+UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
+UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
+#endif
+#undef UVANY
+
+#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT)           \
+    void NAMEANY(const uint8* src_uv,                                          \
+                 uint8* dst_u, uint8* dst_v, int width) {                      \
+      int n = width & ~MASK;                                                   \
+      ANYTOUV_SIMD(src_uv, dst_u, dst_v, n);                                   \
+      ANYTOUV_C(src_uv  + n * BPP,                                             \
+                dst_u + (n >> SHIFT),                                          \
+                dst_v + (n >> SHIFT),                                          \
+                width & MASK);                                                 \
+    }
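+
+// Illustrative note: SHIFT encodes the horizontal chroma subsampling of the
+// destination (0 for 4:4:4, 1 for 4:2:2, 2 for 4:1:1), so the C tail writes
+// U and V at dst + (n >> SHIFT).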
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
+UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_Unaligned_SSSE3,
+         ARGBToUV444Row_C, 4, 15, 0)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_AVX2
+UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2,
+         YUY2ToUV422Row_C, 2, 31, 1)
+UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2,
+         UYVYToUV422Row_C, 2, 31, 1)
+#endif
+#ifdef HAS_ARGBTOUVROW_SSSE3
+UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3,
+         ARGBToUV422Row_C, 4, 15, 1)
+UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2,
+         YUY2ToUV422Row_C, 2, 15, 1)
+UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2,
+         UYVYToUV422Row_C, 2, 15, 1)
+#endif
+#ifdef HAS_YUY2TOUV422ROW_NEON
+UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON,
+         ARGBToUV444Row_C, 4, 7, 0)
+UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON,
+         ARGBToUV422Row_C, 4, 15, 1)
+UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON,
+         ARGBToUV411Row_C, 4, 31, 2)
+UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON,
+         YUY2ToUV422Row_C, 2, 15, 1)
+UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
+         UYVYToUV422Row_C, 2, 15, 1)
+#endif
+#undef UV422ANY
+
+#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                  \
+    void NAMEANY(const uint8* src_uv,                                          \
+                 uint8* dst_u, uint8* dst_v, int width) {                      \
+      int n = width & ~MASK;                                                   \
+      ANYTOUV_SIMD(src_uv, dst_u, dst_v, n);                                   \
+      ANYTOUV_C(src_uv + n * 2,                                                \
+                dst_u + n,                                                     \
+                dst_v + n,                                                     \
+                width & MASK);                                                 \
+    }
+
+#ifdef HAS_SPLITUVROW_SSE2
+SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
+#endif
+#ifdef HAS_SPLITUVROW_AVX2
+SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31)
+#endif
+#ifdef HAS_SPLITUVROW_NEON
+SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
+#endif
+#ifdef HAS_SPLITUVROW_MIPS_DSPR2
+SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
+              SplitUVRow_C, 15)
+#endif
+#undef SPLITUVROWANY
+
+#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                 \
+    void NAMEANY(const uint8* src_u, const uint8* src_v,                       \
+                 uint8* dst_uv, int width) {                                   \
+      int n = width & ~MASK;                                                   \
+      ANYTOUV_SIMD(src_u, src_v, dst_uv, n);                                   \
+      ANYTOUV_C(src_u + n,                                                     \
+                src_v + n,                                                     \
+                dst_uv + n * 2,                                                \
+                width & MASK);                                                 \
+    }
+
+#ifdef HAS_MERGEUVROW_SSE2
+MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
+#endif
+#ifdef HAS_MERGEUVROW_AVX2
+MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31)
+#endif
+#ifdef HAS_MERGEUVROW_NEON
+MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
+#endif
+#undef MERGEUVROW_ANY
+
+#define MATHROW_ANY(NAMEANY, ARGBMATH_SIMD, ARGBMATH_C, MASK)                  \
+    void NAMEANY(const uint8* src_argb0, const uint8* src_argb1,               \
+                 uint8* dst_argb, int width) {                                 \
+      int n = width & ~MASK;                                                   \
+      ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n);                        \
+      ARGBMATH_C(src_argb0 + n * 4,                                            \
+                 src_argb1 + n * 4,                                            \
+                 dst_argb + n * 4,                                             \
+                 width & MASK);                                                \
+    }
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C,
+            3)
+#endif
+#ifdef HAS_ARGBADDROW_SSE2
+MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C,
+            3)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C,
+            7)
+#endif
+#ifdef HAS_ARGBADDROW_AVX2
+MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C,
+            7)
+#endif
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C,
+            7)
+#endif
+#ifdef HAS_ARGBADDROW_NEON
+MATHROW_ANY(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, ARGBAddRow_C, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C,
+            7)
+#endif
+#undef MATHROW_ANY
+
+// Shuffle may want to work in place, so the overlapping last-NUM method
+// cannot be used.
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK)                \
+    void NAMEANY(const uint8* src_argb, uint8* dst_argb,                       \
+                 const uint8* shuffler, int width) {                           \
+      int n = width & ~MASK;                                                   \
+      ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n);                           \
+      ARGBTOY_C(src_argb + n * SBPP,                                           \
+                dst_argb  + n * BPP, shuffler, width & MASK);                  \
+    }
+
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2,
+     ARGBShuffleRow_C, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_Unaligned_SSSE3,
+     ARGBShuffleRow_C, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2,
+     ARGBShuffleRow_C, 4, 4, 15)
+#endif
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
+     ARGBShuffleRow_C, 4, 4, 3)
+#endif
+#undef YANY
+
+// Interpolate may want to work in place, so the overlapping last-NUM method
+// cannot be used.
+#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK)                      \
+    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
+                 ptrdiff_t src_stride_ptr, int width,                          \
+                 int source_y_fraction) {                                      \
+      int n = width & ~MASK;                                                   \
+      TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr,                              \
+                n, source_y_fraction);                                         \
+      TERP_C(dst_ptr + n * BPP,                                                \
+             src_ptr + n * SBPP, src_stride_ptr,                               \
+             width & MASK, source_y_fraction);                                 \
+    }
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
+     InterpolateRow_C, 1, 1, 32)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSSE3
+NANY(InterpolateRow_Any_SSSE3, InterpolateRow_Unaligned_SSSE3,
+     InterpolateRow_C, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_SSE2
+NANY(InterpolateRow_Any_SSE2, InterpolateRow_Unaligned_SSE2,
+     InterpolateRow_C, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_NEON
+NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
+     InterpolateRow_C, 1, 1, 15)
+#endif
+#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
+NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
+     InterpolateRow_C, 1, 1, 3)
+#endif
+#undef NANY
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/row_common.cc
@@ -1,0 +1,2286 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#include <string.h>  // For memcpy and memset.
+
+#include "third_party/libyuv/include/libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// llvm on x86 generates poor code for the ternary operator, so use
+// branchless min/max.
+
+#define USE_BRANCHLESS 1
+#if USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return ((-(v) >> 31) & (v));
+}
+
+static __inline int32 clamp255(int32 v) {
+  return (((255 - (v)) >> 31) | (v)) & 255;
+}
+
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+  int m = v >> 31;
+  return (v + m) ^ m;
+}
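+
+// Worked example of the branchless forms above (assuming the usual
+// arithmetic right shift for negative int32): clamp0(-5) = (5 >> 31) & -5
+// = 0 & -5 = 0; clamp255(300) = (((255 - 300) >> 31) | 300) & 255
+// = (-1 | 300) & 255 = 255; in-range values pass through unchanged.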
+#else  // USE_BRANCHLESS
+static __inline int32 clamp0(int32 v) {
+  return (v < 0) ? 0 : v;
+}
+
+static __inline int32 clamp255(int32 v) {
+  return (v > 255) ? 255 : v;
+}
+
+static __inline uint32 Clamp(int32 val) {
+  int v = clamp0(val);
+  return (uint32)(clamp255(v));
+}
+
+static __inline uint32 Abs(int32 v) {
+  return (v < 0) ? -v : v;
+}
+#endif  // USE_BRANCHLESS
+
+#ifdef LIBYUV_LITTLE_ENDIAN
+#define WRITEWORD(p, v) *(uint32*)(p) = v
+#else
+static inline void WRITEWORD(uint8* p, uint32 v) {
+  p[0] = (uint8)(v & 255);
+  p[1] = (uint8)((v >> 8) & 255);
+  p[2] = (uint8)((v >> 16) & 255);
+  p[3] = (uint8)((v >> 24) & 255);
+}
+#endif
+
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_rgb24[0];
+    uint8 g = src_rgb24[1];
+    uint8 r = src_rgb24[2];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_rgb24 += 3;
+  }
+}
+
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 r = src_raw[0];
+    uint8 g = src_raw[1];
+    uint8 b = src_raw[2];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_raw += 3;
+  }
+}
+
+void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_rgb565[0] & 0x1f;
+    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r = src_rgb565[1] >> 3;
+    dst_argb[0] = (b << 3) | (b >> 2);
+    dst_argb[1] = (g << 2) | (g >> 4);
+    dst_argb[2] = (r << 3) | (r >> 2);
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    src_rgb565 += 2;
+  }
+}
+
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb1555[0] & 0x1f;
+    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 a = src_argb1555[1] >> 7;
+    dst_argb[0] = (b << 3) | (b >> 2);
+    dst_argb[1] = (g << 3) | (g >> 2);
+    dst_argb[2] = (r << 3) | (r >> 2);
+    dst_argb[3] = -a;
+    dst_argb += 4;
+    src_argb1555 += 2;
+  }
+}
+
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb4444[0] & 0x0f;
+    uint8 g = src_argb4444[0] >> 4;
+    uint8 r = src_argb4444[1] & 0x0f;
+    uint8 a = src_argb4444[1] >> 4;
+    dst_argb[0] = (b << 4) | b;
+    dst_argb[1] = (g << 4) | g;
+    dst_argb[2] = (r << 4) | r;
+    dst_argb[3] = (a << 4) | a;
+    dst_argb += 4;
+    src_argb4444 += 2;
+  }
+}
+
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb[0];
+    uint8 g = src_argb[1];
+    uint8 r = src_argb[2];
+    dst_rgb[0] = b;
+    dst_rgb[1] = g;
+    dst_rgb[2] = r;
+    dst_rgb += 3;
+    src_argb += 4;
+  }
+}
+
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb[0];
+    uint8 g = src_argb[1];
+    uint8 r = src_argb[2];
+    dst_rgb[0] = r;
+    dst_rgb[1] = g;
+    dst_rgb[2] = b;
+    dst_rgb += 3;
+    src_argb += 4;
+  }
+}
+
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 b0 = src_argb[0] >> 3;
+    uint8 g0 = src_argb[1] >> 2;
+    uint8 r0 = src_argb[2] >> 3;
+    uint8 b1 = src_argb[4] >> 3;
+    uint8 g1 = src_argb[5] >> 2;
+    uint8 r1 = src_argb[6] >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
+              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    uint8 b0 = src_argb[0] >> 3;
+    uint8 g0 = src_argb[1] >> 2;
+    uint8 r0 = src_argb[2] >> 3;
+    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 b0 = src_argb[0] >> 3;
+    uint8 g0 = src_argb[1] >> 3;
+    uint8 r0 = src_argb[2] >> 3;
+    uint8 a0 = src_argb[3] >> 7;
+    uint8 b1 = src_argb[4] >> 3;
+    uint8 g1 = src_argb[5] >> 3;
+    uint8 r1 = src_argb[6] >> 3;
+    uint8 a1 = src_argb[7] >> 7;
+    *(uint32*)(dst_rgb) =
+        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    uint8 b0 = src_argb[0] >> 3;
+    uint8 g0 = src_argb[1] >> 3;
+    uint8 r0 = src_argb[2] >> 3;
+    uint8 a0 = src_argb[3] >> 7;
+    *(uint16*)(dst_rgb) =
+        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+  }
+}
+
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 b0 = src_argb[0] >> 4;
+    uint8 g0 = src_argb[1] >> 4;
+    uint8 r0 = src_argb[2] >> 4;
+    uint8 a0 = src_argb[3] >> 4;
+    uint8 b1 = src_argb[4] >> 4;
+    uint8 g1 = src_argb[5] >> 4;
+    uint8 r1 = src_argb[6] >> 4;
+    uint8 a1 = src_argb[7] >> 4;
+    *(uint32*)(dst_rgb) =
+        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+    dst_rgb += 4;
+    src_argb += 8;
+  }
+  if (width & 1) {
+    uint8 b0 = src_argb[0] >> 4;
+    uint8 g0 = src_argb[1] >> 4;
+    uint8 r0 = src_argb[2] >> 4;
+    uint8 a0 = src_argb[3] >> 4;
+    *(uint16*)(dst_rgb) =
+        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+  }
+}
+
+static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
+  return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
+}
+
+static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+  return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
+}
+static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+  return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
+}
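+
+// Worked example (illustration only): with the BT.601 coefficients above,
+// RGBToY(255, 255, 255) = (66*255 + 129*255 + 25*255 + 0x1080) >> 8 = 235
+// and RGBToY(0, 0, 0) = 0x1080 >> 8 = 16, i.e. studio-swing luma in
+// [16, 235].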
+
+#define MAKEROWY(NAME, R, G, B, BPP) \
+void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
+  int x;                                                                       \
+  for (x = 0; x < width; ++x) {                                                \
+    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
+    src_argb0 += BPP;                                                          \
+    dst_y += 1;                                                                \
+  }                                                                            \
+}                                                                              \
+void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
+                       uint8* dst_u, uint8* dst_v, int width) {                \
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
+  int x;                                                                       \
+  for (x = 0; x < width - 1; x += 2) {                                         \
+    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
+               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
+    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
+               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
+    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
+               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
+    dst_u[0] = RGBToU(ar, ag, ab);                                             \
+    dst_v[0] = RGBToV(ar, ag, ab);                                             \
+    src_rgb0 += BPP * 2;                                                       \
+    src_rgb1 += BPP * 2;                                                       \
+    dst_u += 1;                                                                \
+    dst_v += 1;                                                                \
+  }                                                                            \
+  if (width & 1) {                                                             \
+    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
+    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
+    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
+    dst_u[0] = RGBToU(ar, ag, ab);                                             \
+    dst_v[0] = RGBToV(ar, ag, ab);                                             \
+  }                                                                            \
+}
+
+MAKEROWY(ARGB, 2, 1, 0, 4)
+MAKEROWY(BGRA, 1, 2, 3, 4)
+MAKEROWY(ABGR, 0, 1, 2, 4)
+MAKEROWY(RGBA, 3, 2, 1, 4)
+MAKEROWY(RGB24, 2, 1, 0, 3)
+MAKEROWY(RAW, 0, 1, 2, 3)
+#undef MAKEROWY
+
+// JPeg uses a variation on BT.601-1 full range
+// y =  0.29900 * r + 0.58700 * g + 0.11400 * b
+// u = -0.16874 * r - 0.33126 * g + 0.50000 * b  + center
+// v =  0.50000 * r - 0.41869 * g - 0.08131 * b  + center
+// BT.601 Mpeg range uses:
+// b 0.1016 * 255 = 25.908 = 25
+// g 0.5078 * 255 = 129.489 = 129
+// r 0.2578 * 255 = 65.739 = 66
+// JPeg 8 bit Y (not used):
+// b 0.11400 * 256 = 29.184 = 29
+// g 0.58700 * 256 = 150.272 = 150
+// r 0.29900 * 256 = 76.544 = 77
+// JPeg 7 bit Y:
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+// JPeg 8 bit U:
+// b  0.50000 * 255 = 127.5 = 127
+// g -0.33126 * 255 = -84.4713 = -84
+// r -0.16874 * 255 = -43.0287 = -43
+// JPeg 8 bit V:
+// b -0.08131 * 255 = -20.73405 = -20
+// g -0.41869 * 255 = -106.76595 = -107
+// r  0.50000 * 255 = 127.5 = 127
+
+static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
+  return (38 * r + 75 * g +  15 * b + 64) >> 7;
+}
+
+static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+  return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
+}
+static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+  return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
+}
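+
+// Worked example (illustration only): RGBToYJ(255, 255, 255)
+// = (38*255 + 75*255 + 15*255 + 64) >> 7 = 255 and RGBToYJ(0, 0, 0) = 0,
+// so the JPeg variant keeps the full [0, 255] range.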
+
+#define AVGB(a, b) (((a) + (b) + 1) >> 1)
+
+#define MAKEROWYJ(NAME, R, G, B, BPP) \
+void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
+  int x;                                                                       \
+  for (x = 0; x < width; ++x) {                                                \
+    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
+    src_argb0 += BPP;                                                          \
+    dst_y += 1;                                                                \
+  }                                                                            \
+}                                                                              \
+void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
+                        uint8* dst_u, uint8* dst_v, int width) {               \
+  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
+  int x;                                                                       \
+  for (x = 0; x < width - 1; x += 2) {                                         \
+    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
+                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
+    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
+                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
+    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
+                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
+    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
+    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
+    src_rgb0 += BPP * 2;                                                       \
+    src_rgb1 += BPP * 2;                                                       \
+    dst_u += 1;                                                                \
+    dst_v += 1;                                                                \
+  }                                                                            \
+  if (width & 1) {                                                             \
+    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
+    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
+    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
+    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
+    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
+  }                                                                            \
+}
+
+MAKEROWYJ(ARGB, 2, 1, 0, 4)
+#undef MAKEROWYJ
+
+void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_rgb565[0] & 0x1f;
+    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r = src_rgb565[1] >> 3;
+    b = (b << 3) | (b >> 2);
+    g = (g << 2) | (g >> 4);
+    r = (r << 3) | (r >> 2);
+    dst_y[0] = RGBToY(r, g, b);
+    src_rgb565 += 2;
+    dst_y += 1;
+  }
+}
+
+void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb1555[0] & 0x1f;
+    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+    b = (b << 3) | (b >> 2);
+    g = (g << 3) | (g >> 2);
+    r = (r << 3) | (r >> 2);
+    dst_y[0] = RGBToY(r, g, b);
+    src_argb1555 += 2;
+    dst_y += 1;
+  }
+}
+
+void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 b = src_argb4444[0] & 0x0f;
+    uint8 g = src_argb4444[0] >> 4;
+    uint8 r = src_argb4444[1] & 0x0f;
+    b = (b << 4) | b;
+    g = (g << 4) | g;
+    r = (r << 4) | r;
+    dst_y[0] = RGBToY(r, g, b);
+    src_argb4444 += 2;
+    dst_y += 1;
+  }
+}
+
+void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
+                     uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 b0 = src_rgb565[0] & 0x1f;
+    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r0 = src_rgb565[1] >> 3;
+    uint8 b1 = src_rgb565[2] & 0x1f;
+    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+    uint8 r1 = src_rgb565[3] >> 3;
+    uint8 b2 = next_rgb565[0] & 0x1f;
+    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8 r2 = next_rgb565[1] >> 3;
+    uint8 b3 = next_rgb565[2] & 0x1f;
+    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+    uint8 r3 = next_rgb565[3] >> 3;
+    uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
+    uint8 g = (g0 + g1 + g2 + g3);
+    uint8 r = (r0 + r1 + r2 + r3);
+    b = (b << 1) | (b >> 6);  // 787 -> 888.
+    r = (r << 1) | (r >> 6);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+    src_rgb565 += 4;
+    next_rgb565 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    uint8 b0 = src_rgb565[0] & 0x1f;
+    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8 r0 = src_rgb565[1] >> 3;
+    uint8 b2 = next_rgb565[0] & 0x1f;
+    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8 r2 = next_rgb565[1] >> 3;
+    uint8 b = (b0 + b2);  // 565 * 2 = 676.
+    uint8 g = (g0 + g2);
+    uint8 r = (r0 + r2);
+    b = (b << 2) | (b >> 4);  // 676 -> 888
+    g = (g << 1) | (g >> 6);
+    r = (r << 2) | (r >> 4);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+  }
+}
+
+void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 b0 = src_argb1555[0] & 0x1f;
+    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 b1 = src_argb1555[2] & 0x1f;
+    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
+    uint8 b2 = next_argb1555[0] & 0x1f;
+    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+    uint8 b3 = next_argb1555[2] & 0x1f;
+    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
+    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
+    uint8 g = (g0 + g1 + g2 + g3);
+    uint8 r = (r0 + r1 + r2 + r3);
+    b = (b << 1) | (b >> 6);  // 777 -> 888.
+    g = (g << 1) | (g >> 6);
+    r = (r << 1) | (r >> 6);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+    src_argb1555 += 4;
+    next_argb1555 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    uint8 b0 = src_argb1555[0] & 0x1f;
+    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8 b2 = next_argb1555[0] & 0x1f;
+    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
+    uint8 b = (b0 + b2);  // 555 * 2 = 666.
+    uint8 g = (g0 + g2);
+    uint8 r = (r0 + r2);
+    b = (b << 2) | (b >> 4);  // 666 -> 888.
+    g = (g << 2) | (g >> 4);
+    r = (r << 2) | (r >> 4);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+  }
+}
+
+void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 b0 = src_argb4444[0] & 0x0f;
+    uint8 g0 = src_argb4444[0] >> 4;
+    uint8 r0 = src_argb4444[1] & 0x0f;
+    uint8 b1 = src_argb4444[2] & 0x0f;
+    uint8 g1 = src_argb4444[2] >> 4;
+    uint8 r1 = src_argb4444[3] & 0x0f;
+    uint8 b2 = next_argb4444[0] & 0x0f;
+    uint8 g2 = next_argb4444[0] >> 4;
+    uint8 r2 = next_argb4444[1] & 0x0f;
+    uint8 b3 = next_argb4444[2] & 0x0f;
+    uint8 g3 = next_argb4444[2] >> 4;
+    uint8 r3 = next_argb4444[3] & 0x0f;
+    uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
+    uint8 g = (g0 + g1 + g2 + g3);
+    uint8 r = (r0 + r1 + r2 + r3);
+    b = (b << 2) | (b >> 4);  // 666 -> 888.
+    g = (g << 2) | (g >> 4);
+    r = (r << 2) | (r >> 4);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+    src_argb4444 += 4;
+    next_argb4444 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    uint8 b0 = src_argb4444[0] & 0x0f;
+    uint8 g0 = src_argb4444[0] >> 4;
+    uint8 r0 = src_argb4444[1] & 0x0f;
+    uint8 b2 = next_argb4444[0] & 0x0f;
+    uint8 g2 = next_argb4444[0] >> 4;
+    uint8 r2 = next_argb4444[1] & 0x0f;
+    uint8 b = (b0 + b2);  // 444 * 2 = 555.
+    uint8 g = (g0 + g2);
+    uint8 r = (r0 + r2);
+    b = (b << 3) | (b >> 2);  // 555 -> 888.
+    g = (g << 3) | (g >> 2);
+    r = (r << 3) | (r >> 2);
+    dst_u[0] = RGBToU(r, g, b);
+    dst_v[0] = RGBToV(r, g, b);
+  }
+}
+
+void ARGBToUV444Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+void ARGBToUV422Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 8;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if (width & 1) {
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  }
+}
+
+void ARGBToUV411Row_C(const uint8* src_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width - 3; x += 4) {
+    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
+    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
+    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+    src_argb += 16;
+    dst_u += 1;
+    dst_v += 1;
+  }
+  if ((width & 3) == 3) {
+    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8]) / 3;
+    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9]) / 3;
+    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10]) / 3;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  } else if ((width & 3) == 2) {
+    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
+    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
+    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  } else if ((width & 3) == 1) {
+    uint8 ab = src_argb[0];
+    uint8 ag = src_argb[1];
+    uint8 ar = src_argb[2];
+    dst_u[0] = RGBToU(ar, ag, ab);
+    dst_v[0] = RGBToV(ar, ag, ab);
+  }
+}
+
+void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+    dst_argb[3] = src_argb[3];
+    dst_argb += 4;
+    src_argb += 4;
+  }
+}
+
+// Convert a row of image to Sepia tone.
+void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = dst_argb[0];
+    int g = dst_argb[1];
+    int r = dst_argb[2];
+    int sb = (b * 17 + g * 68 + r * 35) >> 7;
+    int sg = (b * 22 + g * 88 + r * 45) >> 7;
+    int sr = (b * 24 + g * 98 + r * 50) >> 7;
+    // b does not overflow. a is preserved from the original.
+    dst_argb[0] = sb;
+    dst_argb[1] = clamp255(sg);
+    dst_argb[2] = clamp255(sr);
+    dst_argb += 4;
+  }
+}
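+
+// Note (illustration only): the fixed-point factors above appear to be the
+// commonly used sepia matrix (e.g. 0.393/0.769/0.189 for the red output)
+// scaled by 128. Only sg and sr need clamping: their coefficient sums
+// (22 + 88 + 45 = 155 and 24 + 98 + 50 = 172) exceed 128, whereas
+// 17 + 68 + 35 = 120 keeps sb within 8 bits.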
+
+// Apply color matrix to a row of image. Matrix is signed.
+// TODO(fbarchard): Consider adding rounding (+32).
+void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
+                          const int8* matrix_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = src_argb[0];
+    int g = src_argb[1];
+    int r = src_argb[2];
+    int a = src_argb[3];
+    int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
+              r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
+    int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
+              r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
+    int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
+              r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
+    int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
+              r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
+    dst_argb[0] = Clamp(sb);
+    dst_argb[1] = Clamp(sg);
+    dst_argb[2] = Clamp(sr);
+    dst_argb[3] = Clamp(sa);
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+// Apply color table to a row of image.
+void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = dst_argb[0];
+    int g = dst_argb[1];
+    int r = dst_argb[2];
+    int a = dst_argb[3];
+    dst_argb[0] = table_argb[b * 4 + 0];
+    dst_argb[1] = table_argb[g * 4 + 1];
+    dst_argb[2] = table_argb[r * 4 + 2];
+    dst_argb[3] = table_argb[a * 4 + 3];
+    dst_argb += 4;
+  }
+}
+
+// Apply color table to a row of image.
+void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = dst_argb[0];
+    int g = dst_argb[1];
+    int r = dst_argb[2];
+    dst_argb[0] = table_argb[b * 4 + 0];
+    dst_argb[1] = table_argb[g * 4 + 1];
+    dst_argb[2] = table_argb[r * 4 + 2];
+    dst_argb += 4;
+  }
+}
+
+void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
+                       int interval_offset, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    int b = dst_argb[0];
+    int g = dst_argb[1];
+    int r = dst_argb[2];
+    dst_argb[0] = (b * scale >> 16) * interval_size + interval_offset;
+    dst_argb[1] = (g * scale >> 16) * interval_size + interval_offset;
+    dst_argb[2] = (r * scale >> 16) * interval_size + interval_offset;
+    dst_argb += 4;
+  }
+}
+
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 24
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                    uint32 value) {
+  const uint32 b_scale = REPEAT8(value & 0xff);
+  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
+  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
+  const uint32 a_scale = REPEAT8(value >> 24);
+
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint32 b = REPEAT8(src_argb[0]);
+    const uint32 g = REPEAT8(src_argb[1]);
+    const uint32 r = REPEAT8(src_argb[2]);
+    const uint32 a = REPEAT8(src_argb[3]);
+    dst_argb[0] = SHADE(b, b_scale);
+    dst_argb[1] = SHADE(g, g_scale);
+    dst_argb[2] = SHADE(r, r_scale);
+    dst_argb[3] = SHADE(a, a_scale);
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+#undef REPEAT8
+#undef SHADE
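+
+// For illustration: REPEAT8 widens a byte to 8.8 fixed point (0x80 becomes
+// 0x8080) and the >> 24 in SHADE divides both repeats back out.  A value
+// byte of 0xff is a near-identity scale (200 stays 200) and 0x80 roughly
+// halves a channel (200 becomes 100).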
+
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 16
+
+void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const uint32 b = REPEAT8(src_argb0[0]);
+    const uint32 g = REPEAT8(src_argb0[1]);
+    const uint32 r = REPEAT8(src_argb0[2]);
+    const uint32 a = REPEAT8(src_argb0[3]);
+    const uint32 b_scale = src_argb1[0];
+    const uint32 g_scale = src_argb1[1];
+    const uint32 r_scale = src_argb1[2];
+    const uint32 a_scale = src_argb1[3];
+    dst_argb[0] = SHADE(b, b_scale);
+    dst_argb[1] = SHADE(g, g_scale);
+    dst_argb[2] = SHADE(r, r_scale);
+    dst_argb[3] = SHADE(a, a_scale);
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef REPEAT8
+#undef SHADE
+
+#define SHADE(f, v) clamp255(v + f)
+
+void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                  uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int b = src_argb0[0];
+    const int g = src_argb0[1];
+    const int r = src_argb0[2];
+    const int a = src_argb0[3];
+    const int b_add = src_argb1[0];
+    const int g_add = src_argb1[1];
+    const int r_add = src_argb1[2];
+    const int a_add = src_argb1[3];
+    dst_argb[0] = SHADE(b, b_add);
+    dst_argb[1] = SHADE(g, g_add);
+    dst_argb[2] = SHADE(r, r_add);
+    dst_argb[3] = SHADE(a, a_add);
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef SHADE
+
+#define SHADE(f, v) clamp0(f - v)
+
+void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    const int b = src_argb0[0];
+    const int g = src_argb0[1];
+    const int r = src_argb0[2];
+    const int a = src_argb0[3];
+    const int b_sub = src_argb1[0];
+    const int g_sub = src_argb1[1];
+    const int r_sub = src_argb1[2];
+    const int a_sub = src_argb1[3];
+    dst_argb[0] = SHADE(b, b_sub);
+    dst_argb[1] = SHADE(g, g_sub);
+    dst_argb[2] = SHADE(r, r_sub);
+    dst_argb[3] = SHADE(a, a_sub);
+    src_argb0 += 4;
+    src_argb1 += 4;
+    dst_argb += 4;
+  }
+}
+#undef SHADE
+
+// Sobel functions which mimic the SSSE3 versions.
+void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
+                 uint8* dst_sobelx, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int a = src_y0[i];
+    int b = src_y1[i];
+    int c = src_y2[i];
+    int a_sub = src_y0[i + 2];
+    int b_sub = src_y1[i + 2];
+    int c_sub = src_y2[i + 2];
+    int a_diff = a - a_sub;
+    int b_diff = b - b_sub;
+    int c_diff = c - c_sub;
+    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+    dst_sobelx[i] = (uint8)(clamp255(sobel));
+  }
+}
+
+void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
+                 uint8* dst_sobely, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int a = src_y0[i + 0];
+    int b = src_y0[i + 1];
+    int c = src_y0[i + 2];
+    int a_sub = src_y1[i + 0];
+    int b_sub = src_y1[i + 1];
+    int c_sub = src_y1[i + 2];
+    int a_diff = a - a_sub;
+    int b_diff = b - b_sub;
+    int c_diff = c - c_sub;
+    int sobel = Abs(a_diff + b_diff * 2 + c_diff);
+    dst_sobely[i] = (uint8)(clamp255(sobel));
+  }
+}
+
+void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int r = src_sobelx[i];
+    int b = src_sobely[i];
+    int s = clamp255(r + b);
+    dst_argb[0] = (uint8)(s);
+    dst_argb[1] = (uint8)(s);
+    dst_argb[2] = (uint8)(s);
+    dst_argb[3] = (uint8)(255u);
+    dst_argb += 4;
+  }
+}
+
+void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                       uint8* dst_y, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int r = src_sobelx[i];
+    int b = src_sobely[i];
+    int s = clamp255(r + b);
+    dst_y[i] = (uint8)(s);
+  }
+}
+
+void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
+                  uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    int r = src_sobelx[i];
+    int b = src_sobely[i];
+    int g = clamp255(r + b);
+    dst_argb[0] = (uint8)(b);
+    dst_argb[1] = (uint8)(g);
+    dst_argb[2] = (uint8)(r);
+    dst_argb[3] = (uint8)(255u);
+    dst_argb += 4;
+  }
+}
+
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+  // Copy a Y to RGB.
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8 y = src_y[0];
+    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
+    dst_argb[3] = 255u;
+    dst_argb += 4;
+    ++src_y;
+  }
+}
+
+// C reference code that mimics the YUV assembly.
+
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+
+#define UB 127 /* min(127, (int8)(2.018 * 64)) */
+#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
+                              uint8* b, uint8* g, uint8* r) {
+  int32 y1 = ((int32)(y) - 16) * YG;
+  *b = Clamp((int32)((u * UB + v * VB) - (BB) + y1) >> 6);
+  *g = Clamp((int32)((u * UG + v * VG) - (BG) + y1) >> 6);
+  *r = Clamp((int32)((u * UR + v * VR) - (BR) + y1) >> 6);
+}
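+
+// For illustration: with u = v = 128 the (u * UB + v * VB) style terms equal
+// the BB/BG/BR bias exactly and cancel, leaving only luma.  Black (y = 16)
+// maps to 0 and white (y = 235) maps to ((235 - 16) * 74) >> 6 = 253.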
+
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+// C mimics the assembly.
+// TODO(fbarchard): Remove subsampling from Neon.
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
+    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+    YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 2;
+    src_v += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+  }
+}
+#else
+void I444ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    src_y += 1;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 4;  // Advance 1 pixel.
+  }
+}
+#endif
+// Also used for 420
+void I422ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+void I422ToRGB24Row_C(const uint8* src_y,
+                      const uint8* src_u,
+                      const uint8* src_v,
+                      uint8* rgb_buf,
+                      int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+  }
+}
+
+void I422ToRAWRow_C(const uint8* src_y,
+                    const uint8* src_u,
+                    const uint8* src_v,
+                    uint8* rgb_buf,
+                    int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 5, rgb_buf + 4, rgb_buf + 3);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+  }
+}
+
+void I422ToARGB4444Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb4444,
+                         int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+    b0 = b0 >> 4;
+    g0 = g0 >> 4;
+    r0 = r0 >> 4;
+    b1 = b1 >> 4;
+    g1 = g1 >> 4;
+    r1 = r1 >> 4;
+    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
+        (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_argb4444 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    b0 = b0 >> 4;
+    g0 = g0 >> 4;
+    r0 = r0 >> 4;
+    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
+        0xf000;
+  }
+}
+
+void I422ToARGB1555Row_C(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_argb1555,
+                         int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+    b0 = b0 >> 3;
+    g0 = g0 >> 3;
+    r0 = r0 >> 3;
+    b1 = b1 >> 3;
+    g1 = g1 >> 3;
+    r1 = r1 >> 3;
+    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
+        (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_argb1555 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    b0 = b0 >> 3;
+    g0 = g0 >> 3;
+    r0 = r0 >> 3;
+    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
+        0x8000;
+  }
+}
+
+void I422ToRGB565Row_C(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_rgb565,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    b1 = b1 >> 3;
+    g1 = g1 >> 2;
+    r1 = r1 >> 3;
+    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+void I411ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 3; x += 4) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    YuvPixel(src_y[2], src_u[0], src_v[0],
+             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10);
+    rgb_buf[11] = 255;
+    YuvPixel(src_y[3], src_u[0], src_v[0],
+             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14);
+    rgb_buf[15] = 255;
+    src_y += 4;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 16;  // Advance 4 pixels.
+  }
+  if (width & 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+void NV12ToARGBRow_C(const uint8* src_y,
+                     const uint8* usrc_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], usrc_v[0], usrc_v[1],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    usrc_v += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], usrc_v[0], usrc_v[1],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+void NV21ToARGBRow_C(const uint8* src_y,
+                     const uint8* src_vu,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_vu[1], src_vu[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+
+    YuvPixel(src_y[1], src_vu[1], src_vu[0],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+
+    src_y += 2;
+    src_vu += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_vu[1], src_vu[0],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+void NV12ToRGB565Row_C(const uint8* src_y,
+                       const uint8* usrc_v,
+                       uint8* dst_rgb565,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
+    YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    b1 = b1 >> 3;
+    g1 = g1 >> 2;
+    r1 = r1 >> 3;
+    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    src_y += 2;
+    usrc_v += 2;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+void NV21ToRGB565Row_C(const uint8* src_y,
+                       const uint8* vsrc_u,
+                       uint8* dst_rgb565,
+                       int width) {
+  uint8 b0;
+  uint8 g0;
+  uint8 r0;
+  uint8 b1;
+  uint8 g1;
+  uint8 r1;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+    YuvPixel(src_y[1], vsrc_u[1], vsrc_u[0], &b1, &g1, &r1);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    b1 = b1 >> 3;
+    g1 = g1 >> 2;
+    r1 = r1 >> 3;
+    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
+        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    src_y += 2;
+    vsrc_u += 2;
+    dst_rgb565 += 4;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], vsrc_u[1], vsrc_u[0], &b0, &g0, &r0);
+    b0 = b0 >> 3;
+    g0 = g0 >> 2;
+    r0 = r0 >> 3;
+    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+  }
+}
+
+void YUY2ToARGBRow_C(const uint8* src_yuy2,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    src_yuy2 += 4;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+void UYVYToARGBRow_C(const uint8* src_uyvy,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    src_uyvy += 4;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+void I422ToBGRARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
+    rgb_buf[0] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 7, rgb_buf + 6, rgb_buf + 5);
+    rgb_buf[4] = 255;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 3, rgb_buf + 2, rgb_buf + 1);
+    rgb_buf[0] = 255;
+  }
+}
+
+void I422ToABGRRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 6, rgb_buf + 5, rgb_buf + 4);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 2, rgb_buf + 1, rgb_buf + 0);
+    rgb_buf[3] = 255;
+  }
+}
+
+void I422ToRGBARow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* rgb_buf,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
+    rgb_buf[0] = 255;
+    YuvPixel(src_y[1], src_u[0], src_v[0],
+             rgb_buf + 5, rgb_buf + 6, rgb_buf + 7);
+    rgb_buf[4] = 255;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_u[0], src_v[0],
+             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3);
+    rgb_buf[0] = 255;
+  }
+}
+
+void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], 128, 128,
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+    YuvPixel(src_y[1], 128, 128,
+             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], 128, 128,
+             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
+    rgb_buf[3] = 255;
+  }
+}
+
+void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+  int x;
+  src += width - 1;
+  for (x = 0; x < width - 1; x += 2) {
+    dst[x] = src[0];
+    dst[x + 1] = src[-1];
+    src -= 2;
+  }
+  if (width & 1) {
+    dst[width - 1] = src[0];
+  }
+}
+
+void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  src_uv += (width - 1) << 1;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_u[x] = src_uv[0];
+    dst_u[x + 1] = src_uv[-2];
+    dst_v[x] = src_uv[1];
+    dst_v[x + 1] = src_uv[-2 + 1];
+    src_uv -= 4;
+  }
+  if (width & 1) {
+    dst_u[width - 1] = src_uv[0];
+    dst_v[width - 1] = src_uv[1];
+  }
+}
+
+void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+  int x;
+  const uint32* src32 = (const uint32*)(src);
+  uint32* dst32 = (uint32*)(dst);
+  src32 += width - 1;
+  for (x = 0; x < width - 1; x += 2) {
+    dst32[x] = src32[0];
+    dst32[x + 1] = src32[-1];
+    src32 -= 2;
+  }
+  if (width & 1) {
+    dst32[width - 1] = src32[0];
+  }
+}
+
+void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_u[x] = src_uv[0];
+    dst_u[x + 1] = src_uv[2];
+    dst_v[x] = src_uv[1];
+    dst_v[x + 1] = src_uv[3];
+    src_uv += 4;
+  }
+  if (width & 1) {
+    dst_u[width - 1] = src_uv[0];
+    dst_v[width - 1] = src_uv[1];
+  }
+}
+
+void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                  int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = src_u[x];
+    dst_uv[1] = src_v[x];
+    dst_uv[2] = src_u[x + 1];
+    dst_uv[3] = src_v[x + 1];
+    dst_uv += 4;
+  }
+  if (width & 1) {
+    dst_uv[0] = src_u[width - 1];
+    dst_uv[1] = src_v[width - 1];
+  }
+}
+
+void CopyRow_C(const uint8* src, uint8* dst, int count) {
+  memcpy(dst, src, count);
+}
+
+void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+  memcpy(dst, src, count * 2);
+}
+
+void SetRow_C(uint8* dst, uint32 v8, int count) {
+#ifdef _MSC_VER
+  // VC will generate rep stosb.
+  int x;
+  for (x = 0; x < count; ++x) {
+    dst[x] = v8;
+  }
+#else
+  memset(dst, v8, count);
+#endif
+}
+
+void ARGBSetRows_C(uint8* dst, uint32 v32, int width,
+                   int dst_stride, int height) {
+  int y;
+  for (y = 0; y < height; ++y) {
+    uint32* d = (uint32*)(dst);
+    int x;
+    for (x = 0; x < width; ++x) {
+      d[x] = v32;
+    }
+    dst += dst_stride;
+  }
+}
+
+// Filter 2 rows of YUY2 UV's (422) into U and V (420).
+void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
+                   uint8* dst_u, uint8* dst_v, int width) {
+  // Output a row of UV values, filtering 2 rows of YUY2.
+  int x;
+  for (x = 0; x < width; x += 2) {
+    dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
+    dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
+    src_yuy2 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+// Copy row of YUY2 UV's (422) into U and V (422).
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  // Output a row of UV values.
+  int x;
+  for (x = 0; x < width; x += 2) {
+    dst_u[0] = src_yuy2[1];
+    dst_v[0] = src_yuy2[3];
+    src_yuy2 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+// Copy row of YUY2 Y's (422) into Y (420/422).
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+  // Output a row of Y values.
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_y[x] = src_yuy2[0];
+    dst_y[x + 1] = src_yuy2[2];
+    src_yuy2 += 4;
+  }
+  if (width & 1) {
+    dst_y[width - 1] = src_yuy2[0];
+  }
+}
+
+// Filter 2 rows of UYVY UV's (422) into U and V (420).
+void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
+                   uint8* dst_u, uint8* dst_v, int width) {
+  // Output a row of UV values.
+  int x;
+  for (x = 0; x < width; x += 2) {
+    dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
+    dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
+    src_uyvy += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+// Copy row of UYVY UV's (422) into U and V (422).
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  // Output a row of UV values.
+  int x;
+  for (x = 0; x < width; x += 2) {
+    dst_u[0] = src_uyvy[0];
+    dst_v[0] = src_uyvy[2];
+    src_uyvy += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+// Copy row of UYVY Y's (422) into Y (420/422).
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+  // Output a row of Y values.
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_y[x] = src_uyvy[1];
+    dst_y[x + 1] = src_uyvy[3];
+    src_uyvy += 4;
+  }
+  if (width & 1) {
+    dst_y[width - 1] = src_uyvy[1];
+  }
+}
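+
+// For illustration: as the byte offsets above imply, YUY2 packs each pixel
+// pair as Y0, U, Y1, V while UYVY packs it as U, Y0, V, Y1, which is why the
+// Y and UV extractors differ only in the indices they read.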
+
+#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
+
+// Blend src_argb0 over src_argb1 and store to dst_argb.
+// dst_argb may be src_argb0 or src_argb1.
+// This code mimics the SSSE3 version for better testability.
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                    uint8* dst_argb, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    uint32 fb = src_argb0[0];
+    uint32 fg = src_argb0[1];
+    uint32 fr = src_argb0[2];
+    uint32 a = src_argb0[3];
+    uint32 bb = src_argb1[0];
+    uint32 bg = src_argb1[1];
+    uint32 br = src_argb1[2];
+    dst_argb[0] = BLEND(fb, bb, a);
+    dst_argb[1] = BLEND(fg, bg, a);
+    dst_argb[2] = BLEND(fr, br, a);
+    dst_argb[3] = 255u;
+
+    fb = src_argb0[4 + 0];
+    fg = src_argb0[4 + 1];
+    fr = src_argb0[4 + 2];
+    a = src_argb0[4 + 3];
+    bb = src_argb1[4 + 0];
+    bg = src_argb1[4 + 1];
+    br = src_argb1[4 + 2];
+    dst_argb[4 + 0] = BLEND(fb, bb, a);
+    dst_argb[4 + 1] = BLEND(fg, bg, a);
+    dst_argb[4 + 2] = BLEND(fr, br, a);
+    dst_argb[4 + 3] = 255u;
+    src_argb0 += 8;
+    src_argb1 += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {
+    uint32 fb = src_argb0[0];
+    uint32 fg = src_argb0[1];
+    uint32 fr = src_argb0[2];
+    uint32 a = src_argb0[3];
+    uint32 bb = src_argb1[0];
+    uint32 bg = src_argb1[1];
+    uint32 br = src_argb1[2];
+    dst_argb[0] = BLEND(fb, bb, a);
+    dst_argb[1] = BLEND(fg, bg, a);
+    dst_argb[2] = BLEND(fr, br, a);
+    dst_argb[3] = 255u;
+  }
+}
+#undef BLEND
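+
+// For illustration: with a = 255, ((256 - a) * b) >> 8 is 0, so BLEND returns
+// the foreground; with a = 0 it returns f + b, which is consistent with a
+// foreground that is already premultiplied by its alpha (see
+// ARGBAttenuateRow_C below).
+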
+#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
+
+// Multiply source RGB by alpha and store to destination.
+// This code mimics the SSSE3 version for better testability.
+void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    uint32 b = src_argb[0];
+    uint32 g = src_argb[1];
+    uint32 r = src_argb[2];
+    uint32 a = src_argb[3];
+    dst_argb[0] = ATTENUATE(b, a);
+    dst_argb[1] = ATTENUATE(g, a);
+    dst_argb[2] = ATTENUATE(r, a);
+    dst_argb[3] = a;
+    b = src_argb[4];
+    g = src_argb[5];
+    r = src_argb[6];
+    a = src_argb[7];
+    dst_argb[4] = ATTENUATE(b, a);
+    dst_argb[5] = ATTENUATE(g, a);
+    dst_argb[6] = ATTENUATE(r, a);
+    dst_argb[7] = a;
+    src_argb += 8;
+    dst_argb += 8;
+  }
+
+  if (width & 1) {
+    const uint32 b = src_argb[0];
+    const uint32 g = src_argb[1];
+    const uint32 r = src_argb[2];
+    const uint32 a = src_argb[3];
+    dst_argb[0] = ATTENUATE(b, a);
+    dst_argb[1] = ATTENUATE(g, a);
+    dst_argb[2] = ATTENUATE(r, a);
+    dst_argb[3] = a;
+  }
+}
+#undef ATTENUATE
+
+// Divide source RGB by alpha and store to destination.
+// b = (b * 255 + (a / 2)) / a;
+// g = (g * 255 + (a / 2)) / a;
+// r = (r * 255 + (a / 2)) / a;
+// The reciprocal method is off by 1 on some values, e.g. 125.
+// 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
+#define T(a) 0x01000000 + (0x10000 / a)
+const uint32 fixed_invtbl8[256] = {
+  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
+  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
+  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
+  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
+  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
+  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
+  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
+  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
+  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
+  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
+  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
+  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
+  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
+  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
+  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
+  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
+  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
+  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
+  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
+  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
+  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
+  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
+  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
+  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
+  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
+  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
+  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
+  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
+  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
+  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
+  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
+#undef T
+
+void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    uint32 b = src_argb[0];
+    uint32 g = src_argb[1];
+    uint32 r = src_argb[2];
+    const uint32 a = src_argb[3];
+    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
+    b = (b * ia) >> 8;
+    g = (g * ia) >> 8;
+    r = (r * ia) >> 8;
+    // Clamping should not be necessary but is free in assembly.
+    dst_argb[0] = clamp255(b);
+    dst_argb[1] = clamp255(g);
+    dst_argb[2] = clamp255(r);
+    dst_argb[3] = a;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
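+
+// For illustration: for a = 0x80, fixed_invtbl8[0x80] is 0x01000200, so
+// ia = 0x0200, i.e. 2.0 in 8.8 fixed point, and (b * ia) >> 8 doubles the
+// channel, undoing an attenuation by alpha 128.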
+
+void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
+                               const int32* previous_cumsum, int width) {
+  int32 row_sum[4] = {0, 0, 0, 0};
+  int x;
+  for (x = 0; x < width; ++x) {
+    row_sum[0] += row[x * 4 + 0];
+    row_sum[1] += row[x * 4 + 1];
+    row_sum[2] += row[x * 4 + 2];
+    row_sum[3] += row[x * 4 + 3];
+    cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
+    cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
+    cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
+    cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
+  }
+}
+
+void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
+                                int w, int area, uint8* dst, int count) {
+  float ooa = 1.0f / area;
+  int i;
+  for (i = 0; i < count; ++i) {
+    dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+    dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+    dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+    dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+    dst += 4;
+    tl += 4;
+    bl += 4;
+  }
+}
+
+// Copy pixels from rotated source to destination row with a slope.
+LIBYUV_API
+void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
+                     uint8* dst_argb, const float* uv_dudv, int width) {
+  int i;
+  // Render a row of pixels from source into a buffer.
+  float uv[2];
+  uv[0] = uv_dudv[0];
+  uv[1] = uv_dudv[1];
+  for (i = 0; i < width; ++i) {
+    int x = (int)(uv[0]);
+    int y = (int)(uv[1]);
+    *(uint32*)(dst_argb) =
+        *(const uint32*)(src_argb + y * src_argb_stride +
+                                         x * 4);
+    dst_argb += 4;
+    uv[0] += uv_dudv[2];
+    uv[1] += uv_dudv[3];
+  }
+}
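+
+// For illustration: uv_dudv holds {u0, v0, du, dv} in source pixel units, so
+// an identity transform of {x0, y0, 1.0f, 0.0f} simply copies width pixels of
+// source row y0 starting at column x0.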
+
+// Blend 2 rows into 1 for conversions such as I422ToI420.
+void HalfRow_C(const uint8* src_uv, int src_uv_stride,
+               uint8* dst_uv, int pix) {
+  int x;
+  for (x = 0; x < pix; ++x) {
+    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+  }
+}
+
+void HalfRow_16_C(const uint16* src_uv, int src_uv_stride,
+                  uint16* dst_uv, int pix) {
+  int x;
+  for (x = 0; x < pix; ++x) {
+    dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
+  }
+}
+
+// C version 2x2 -> 2x1.
+void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+                      ptrdiff_t src_stride,
+                      int width, int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint8* src_ptr1 = src_ptr + src_stride;
+  int x;
+  if (source_y_fraction == 0) {
+    memcpy(dst_ptr, src_ptr, width);
+    return;
+  }
+  if (source_y_fraction == 128) {
+    HalfRow_C(src_ptr, (int)(src_stride), dst_ptr, width);
+    return;
+  }
+  for (x = 0; x < width - 1; x += 2) {
+    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+    src_ptr += 2;
+    src_ptr1 += 2;
+    dst_ptr += 2;
+  }
+  if (width & 1) {
+    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+  }
+}
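+
+// For illustration: source_y_fraction is the weight of the second row in
+// 1/256 units: 0 copies src_ptr, 128 averages the two rows (the HalfRow fast
+// path above), and 64 blends them 75% / 25%.  The 16-bit variant below uses
+// the same scheme.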
+
+void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         ptrdiff_t src_stride,
+                         int width, int source_y_fraction) {
+  int y1_fraction = source_y_fraction;
+  int y0_fraction = 256 - y1_fraction;
+  const uint16* src_ptr1 = src_ptr + src_stride;
+  int x;
+  if (source_y_fraction == 0) {
+    memcpy(dst_ptr, src_ptr, width * 2);
+    return;
+  }
+  if (source_y_fraction == 128) {
+    HalfRow_16_C(src_ptr, (int)(src_stride), dst_ptr, width);
+    return;
+  }
+  for (x = 0; x < width - 1; x += 2) {
+    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+    src_ptr += 2;
+    src_ptr1 += 2;
+    dst_ptr += 2;
+  }
+  if (width & 1) {
+    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+  }
+}
+
+// Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
+void ARGBToBayerRow_C(const uint8* src_argb,
+                      uint8* dst_bayer, uint32 selector, int pix) {
+  int index0 = selector & 0xff;
+  int index1 = (selector >> 8) & 0xff;
+  // Copy a row of Bayer.
+  int x;
+  for (x = 0; x < pix - 1; x += 2) {
+    dst_bayer[0] = src_argb[index0];
+    dst_bayer[1] = src_argb[index1];
+    src_argb += 8;
+    dst_bayer += 2;
+  }
+  if (pix & 1) {
+    dst_bayer[0] = src_argb[index0];
+  }
+}
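+
+// For illustration: selector packs two byte offsets per 8-byte (2-pixel)
+// group: index0 = selector & 0xff, index1 = (selector >> 8) & 0xff.  With
+// B,G,R,A byte order, selector 0x0500 takes B from the even pixel and G from
+// the odd pixel, giving the BGBGBGBG pattern mentioned above.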
+
+// Select G channel from ARGB.  e.g.  GGGGGGGG
+void ARGBToBayerGGRow_C(const uint8* src_argb,
+                        uint8* dst_bayer, uint32 selector, int pix) {
+  // Copy a row of G.
+  int x;
+  for (x = 0; x < pix - 1; x += 2) {
+    dst_bayer[0] = src_argb[1];
+    dst_bayer[1] = src_argb[5];
+    src_argb += 8;
+    dst_bayer += 2;
+  }
+  if (pix & 1) {
+    dst_bayer[0] = src_argb[1];
+  }
+}
+
+// Use first 4 shuffler values to reorder ARGB channels.
+void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
+                      const uint8* shuffler, int pix) {
+  int index0 = shuffler[0];
+  int index1 = shuffler[1];
+  int index2 = shuffler[2];
+  int index3 = shuffler[3];
+  // Shuffle a row of ARGB.
+  int x;
+  for (x = 0; x < pix; ++x) {
+    // To support in-place conversion.
+    uint8 b = src_argb[index0];
+    uint8 g = src_argb[index1];
+    uint8 r = src_argb[index2];
+    uint8 a = src_argb[index3];
+    dst_argb[0] = b;
+    dst_argb[1] = g;
+    dst_argb[2] = r;
+    dst_argb[3] = a;
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
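+
+// For illustration: each shuffler entry is a source byte index within the
+// pixel, so a shuffler of {2, 1, 0, 3} swaps the R and B channels
+// (ARGB <-> ABGR byte order).  Reading all four bytes before writing keeps
+// in-place use safe, as noted above.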
+
+void I422ToYUY2Row_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_frame, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_frame[0] = src_y[0];
+    dst_frame[1] = src_u[0];
+    dst_frame[2] = src_y[1];
+    dst_frame[3] = src_v[0];
+    dst_frame += 4;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+  }
+  if (width & 1) {
+    dst_frame[0] = src_y[0];
+    dst_frame[1] = src_u[0];
+    dst_frame[2] = src_y[0];  // duplicate last y
+    dst_frame[3] = src_v[0];
+  }
+}
+
+void I422ToUYVYRow_C(const uint8* src_y,
+                     const uint8* src_u,
+                     const uint8* src_v,
+                     uint8* dst_frame, int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_frame[0] = src_u[0];
+    dst_frame[1] = src_y[0];
+    dst_frame[2] = src_v[0];
+    dst_frame[3] = src_y[1];
+    dst_frame += 4;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+  }
+  if (width & 1) {
+    dst_frame[0] = src_u[0];
+    dst_frame[1] = src_y[0];
+    dst_frame[2] = src_v[0];
+    dst_frame[3] = src_y[0];  // duplicate last y
+  }
+}
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
+// row_win.cc has an asm version, but GCC uses a 2-step wrapper.
+#if !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
+void I422ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* rgb_buf,
+                           int width) {
+  // Allocate a row of ARGB.
+  align_buffer_64(row, width * 4);
+  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+  ARGBToRGB565Row_SSE2(row, rgb_buf, width);
+  free_aligned_buffer_64(row);
+}
+#endif  // !defined(_MSC_VER) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+void I422ToARGB1555Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* rgb_buf,
+                             int width) {
+  // Allocate a row of ARGB.
+  align_buffer_64(row, width * 4);
+  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+  ARGBToARGB1555Row_SSE2(row, rgb_buf, width);
+  free_aligned_buffer_64(row);
+}
+
+void I422ToARGB4444Row_SSSE3(const uint8* src_y,
+                             const uint8* src_u,
+                             const uint8* src_v,
+                             uint8* rgb_buf,
+                             int width) {
+  // Allocate a row of ARGB.
+  align_buffer_64(row, width * 4);
+  I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, width);
+  ARGBToARGB4444Row_SSE2(row, rgb_buf, width);
+  free_aligned_buffer_64(row);
+}
+
+void NV12ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_uv,
+                           uint8* dst_rgb565,
+                           int width) {
+  // Allocate a row of ARGB.
+  align_buffer_64(row, width * 4);
+  NV12ToARGBRow_SSSE3(src_y, src_uv, row, width);
+  ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
+  free_aligned_buffer_64(row);
+}
+
+void NV21ToRGB565Row_SSSE3(const uint8* src_y,
+                           const uint8* src_vu,
+                           uint8* dst_rgb565,
+                           int width) {
+  // Allocate a row of ARGB.
+  align_buffer_64(row, width * 4);
+  NV21ToARGBRow_SSSE3(src_y, src_vu, row, width);
+  ARGBToRGB565Row_SSE2(row, dst_rgb565, width);
+  free_aligned_buffer_64(row);
+}
+
+void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
+                         uint8* dst_argb,
+                         int width) {
+  // Allocate rows of YUV.
+  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+  uint8* row_u = row_y + ((width + 63) & ~63);
+  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+  YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, width);
+  YUY2ToYRow_SSE2(src_yuy2, row_y, width);
+  I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
+  free_aligned_buffer_64(row_y);
+}
+
+void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2,
+                                   uint8* dst_argb,
+                                   int width) {
+  // Allocate rows of YUV.
+  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+  uint8* row_u = row_y + ((width + 63) & ~63);
+  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+  YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, row_u, row_v, width);
+  YUY2ToYRow_Unaligned_SSE2(src_yuy2, row_y, width);
+  I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
+  free_aligned_buffer_64(row_y);
+}
+
+void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
+                         uint8* dst_argb,
+                         int width) {
+  // Allocate rows of YUV.
+  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+  uint8* row_u = row_y + ((width + 63) & ~63);
+  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+  UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, width);
+  UYVYToYRow_SSE2(src_uyvy, row_y, width);
+  I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, width);
+  free_aligned_buffer_64(row_y);
+}
+
+void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
+                                   uint8* dst_argb,
+                                   int width) {
+  // Allocate rows of YUV.
+  align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+  uint8* row_u = row_y + ((width + 63) & ~63);
+  uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+  UYVYToUV422Row_Unaligned_SSE2(src_uyvy, row_u, row_v, width);
+  UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
+  I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
+  free_aligned_buffer_64(row_y);
+}
+
+#endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
+#endif  // !defined(LIBYUV_DISABLE_X86)
+
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb, const float* poly,
+                         int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    float b = (float)(src_argb[0]);
+    float g = (float)(src_argb[1]);
+    float r = (float)(src_argb[2]);
+    float a = (float)(src_argb[3]);
+    float b2 = b * b;
+    float g2 = g * g;
+    float r2 = r * r;
+    float a2 = a * a;
+    float db = poly[0] + poly[4] * b;
+    float dg = poly[1] + poly[5] * g;
+    float dr = poly[2] + poly[6] * r;
+    float da = poly[3] + poly[7] * a;
+    float b3 = b2 * b;
+    float g3 = g2 * g;
+    float r3 = r2 * r;
+    float a3 = a2 * a;
+    db += poly[8] * b2;
+    dg += poly[9] * g2;
+    dr += poly[10] * r2;
+    da += poly[11] * a2;
+    db += poly[12] * b3;
+    dg += poly[13] * g3;
+    dr += poly[14] * r3;
+    da += poly[15] * a3;
+
+    dst_argb[0] = Clamp((int32)(db));
+    dst_argb[1] = Clamp((int32)(dg));
+    dst_argb[2] = Clamp((int32)(dr));
+    dst_argb[3] = Clamp((int32)(da));
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
+void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                             const uint8* luma, uint32 lumacoeff) {
+  uint32 bc = lumacoeff & 0xff;
+  uint32 gc = (lumacoeff >> 8) & 0xff;
+  uint32 rc = (lumacoeff >> 16) & 0xff;
+
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    // Luminance in rows, color values in columns.
+    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+                           src_argb[2] * rc) & 0x7F00u) + luma;
+    const uint8* luma1;
+    dst_argb[0] = luma0[src_argb[0]];
+    dst_argb[1] = luma0[src_argb[1]];
+    dst_argb[2] = luma0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
+              src_argb[6] * rc) & 0x7F00u) + luma;
+    dst_argb[4] = luma1[src_argb[4]];
+    dst_argb[5] = luma1[src_argb[5]];
+    dst_argb[6] = luma1[src_argb[6]];
+    dst_argb[7] = src_argb[7];
+    src_argb += 8;
+    dst_argb += 8;
+  }
+  if (width & 1) {
+    // Luminance in rows, color values in columns.
+    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
+                           src_argb[2] * rc) & 0x7F00u) + luma;
+    dst_argb[0] = luma0[src_argb[0]];
+    dst_argb[1] = luma0[src_argb[1]];
+    dst_argb[2] = luma0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+  }
+}
+
+void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst[3] = src[3];
+    dst[7] = src[7];
+    dst += 8;
+    src += 8;
+  }
+  if (width & 1) {
+    dst[3] = src[3];
+  }
+}
+
+void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+  int i;
+  for (i = 0; i < width - 1; i += 2) {
+    dst[3] = src[0];
+    dst[7] = src[1];
+    dst += 8;
+    src += 2;
+  }
+  if (width & 1) {
+    dst[3] = src[0];
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/row_mips.cc
@@ -1,0 +1,991 @@
+/*
+ *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// The following are available on Mips platforms:
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+
+#ifdef HAS_COPYROW_MIPS
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
+  __asm__ __volatile__ (
+    ".set      noreorder                         \n"
+    ".set      noat                              \n"
+    "slti      $at, %[count], 8                  \n"
+    "bne       $at ,$zero, $last8                \n"
+    "xor       $t8, %[src], %[dst]               \n"
+    "andi      $t8, $t8, 0x3                     \n"
+
+    "bne       $t8, $zero, unaligned             \n"
+    "negu      $a3, %[dst]                       \n"
+    // make dst/src aligned
+    "andi      $a3, $a3, 0x3                     \n"
+    "beq       $a3, $zero, $chk16w               \n"
+    // word-aligned; now count is the remaining byte count
+    "subu     %[count], %[count], $a3            \n"
+
+    "lwr       $t8, 0(%[src])                    \n"
+    "addu      %[src], %[src], $a3               \n"
+    "swr       $t8, 0(%[dst])                    \n"
+    "addu      %[dst], %[dst], $a3               \n"
+
+    // Now the dst/src are mutually word-aligned with word-aligned addresses
+    "$chk16w:                                    \n"
+    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq       %[count], $t8, chk8w              \n"
+    // There will be at most 1 32-byte chunk after it
+    "subu      $a3, %[count], $t8                \n"  // the reminder
+    // Here a3 counts bytes in 16w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu      $t0, %[dst], %[count]             \n"
+    // t0 is the "past the end" address
+
+    // When the loop uses "pref 30,x(a1)", a1+x must not go past the
+    // "t0-32" address.
+    // This means: for x=128 the last "safe" a1 address is "t0-160"
+    // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
+    // we will use "pref 30,128(a1)", so "t0-160" is the limit
+    "subu      $t9, $t0, 160                     \n"
+    // t9 is the "last safe pref 30,128(a1)" address
+    "pref      0, 0(%[src])                      \n"  // first line of src
+    "pref      0, 32(%[src])                     \n"  // second line of src
+    "pref      0, 64(%[src])                     \n"
+    "pref      30, 32(%[dst])                    \n"
+    // In case the a1 > t9 don't use "pref 30" at all
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bgtz      $v1, $loop16w                     \n"
+    "nop                                         \n"
+    // otherwise, start with using pref30
+    "pref      30, 64(%[dst])                    \n"
+    "$loop16w:                                    \n"
+    "pref      0, 96(%[src])                     \n"
+    "lw        $t0, 0(%[src])                    \n"
+    "bgtz      $v1, $skip_pref30_96              \n"  // skip
+    "lw        $t1, 4(%[src])                    \n"
+    "pref      30, 96(%[dst])                    \n"  // continue
+    "$skip_pref30_96:                            \n"
+    "lw        $t2, 8(%[src])                    \n"
+    "lw        $t3, 12(%[src])                   \n"
+    "lw        $t4, 16(%[src])                   \n"
+    "lw        $t5, 20(%[src])                   \n"
+    "lw        $t6, 24(%[src])                   \n"
+    "lw        $t7, 28(%[src])                   \n"
+    "pref      0, 128(%[src])                    \n"
+    //  bring the next lines of src, addr 128
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "lw        $t0, 32(%[src])                   \n"
+    "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(a1)
+    "lw        $t1, 36(%[src])                   \n"
+    "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
+    "$skip_pref30_128:                           \n"
+    "lw        $t2, 40(%[src])                   \n"
+    "lw        $t3, 44(%[src])                   \n"
+    "lw        $t4, 48(%[src])                   \n"
+    "lw        $t5, 52(%[src])                   \n"
+    "lw        $t6, 56(%[src])                   \n"
+    "lw        $t7, 60(%[src])                   \n"
+    "pref      0, 160(%[src])                    \n"
+    // bring the next lines of src, addr 160
+    "sw        $t0, 32(%[dst])                   \n"
+    "sw        $t1, 36(%[dst])                   \n"
+    "sw        $t2, 40(%[dst])                   \n"
+    "sw        $t3, 44(%[dst])                   \n"
+    "sw        $t4, 48(%[dst])                   \n"
+    "sw        $t5, 52(%[dst])                   \n"
+    "sw        $t6, 56(%[dst])                   \n"
+    "sw        $t7, 60(%[dst])                   \n"
+
+    "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bne       %[dst], $a3, $loop16w             \n"
+    " addiu    %[src], %[src], 64                \n"  // adding 64 to src
+    "move      %[count], $t8                     \n"
+
+    // Here we have src and dest word-aligned but less than 64-bytes to go
+
+    "chk8w:                                      \n"
+    "pref      0, 0x0(%[src])                    \n"
+    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+    // t8 is the remainder count past 32 bytes
+    "beq       %[count], $t8, chk1w              \n"
+    // count == t8, no 32-byte chunk
+    " nop                                        \n"
+
+    "lw        $t0, 0(%[src])                    \n"
+    "lw        $t1, 4(%[src])                    \n"
+    "lw        $t2, 8(%[src])                    \n"
+    "lw        $t3, 12(%[src])                   \n"
+    "lw        $t4, 16(%[src])                   \n"
+    "lw        $t5, 20(%[src])                   \n"
+    "lw        $t6, 24(%[src])                   \n"
+    "lw        $t7, 28(%[src])                   \n"
+    "addiu     %[src], %[src], 32                \n"
+
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "addiu     %[dst], %[dst], 32                \n"
+
+    "chk1w:                                      \n"
+    "andi      %[count], $t8, 0x3                \n"
+    // now count is the remainder past 1w chunks
+    "beq       %[count], $t8, $last8             \n"
+    " subu     $a3, $t8, %[count]                \n"
+    // a3 is count of bytes in 1w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // now a3 is the dst address past the 1w chunks
+    // copying in words (4-byte chunks)
+    "$wordCopy_loop:                             \n"
+    "lw        $t3, 0(%[src])                    \n"
+    // the first t3 may equal t0 ... optimize?
+    "addiu     %[src], %[src],4                  \n"
+    "addiu     %[dst], %[dst],4                  \n"
+    "bne       %[dst], $a3,$wordCopy_loop        \n"
+    " sw       $t3, -4(%[dst])                   \n"
+
+    // For the last (<8) bytes
+    "$last8:                                     \n"
+    "blez      %[count], leave                   \n"
+    " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
+    "$last8loop:                                 \n"
+    "lb        $v1, 0(%[src])                    \n"
+    "addiu     %[src], %[src], 1                 \n"
+    "addiu     %[dst], %[dst], 1                 \n"
+    "bne       %[dst], $a3, $last8loop           \n"
+    " sb       $v1, -1(%[dst])                   \n"
+
+    "leave:                                      \n"
+    "  j       $ra                               \n"
+    "  nop                                       \n"
+
+    //
+    // UNALIGNED case
+    //
+
+    "unaligned:                                  \n"
+    // got here with a3="negu a1"
+    "andi      $a3, $a3, 0x3                     \n"  // a1 is word aligned?
+    "beqz      $a3, $ua_chk16w                   \n"
+    " subu     %[count], %[count], $a3           \n"
+    // bytes left after initial a3 bytes
+    "lwr       $v1, 0(%[src])                    \n"
+    "lwl       $v1, 3(%[src])                    \n"
+    "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
+    "swr       $v1, 0(%[dst])                    \n"
+    "addu      %[dst], %[dst], $a3               \n"
+    // below the dst will be word aligned (NOTE1)
+    "$ua_chk16w:                                 \n"
+    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
+    // t8 is the byte count after 64-byte chunks
+    "beq       %[count], $t8, ua_chk8w           \n"
+    // if a2==t8, no 64-byte chunks
+    // There will be at most 1 32-byte chunk after it
+    "subu      $a3, %[count], $t8                \n"  // the reminder
+    // Here a3 counts bytes in 16w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // Now a3 is the final dst after 64-byte chunks
+    "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
+    "subu      $t9, $t0, 160                     \n"
+    // t9 is the "last safe pref 30,128(a1)" address
+    "pref      0, 0(%[src])                      \n"  // first line of src
+    "pref      0, 32(%[src])                     \n"  // second line  addr 32
+    "pref      0, 64(%[src])                     \n"
+    "pref      30, 32(%[dst])                    \n"
+    // safe, as we have at least 64 bytes ahead
+    // In case a1 > t9, don't use "pref 30" at all
+    "sgtu      $v1, %[dst], $t9                  \n"
+    "bgtz      $v1, $ua_loop16w                  \n"
+    // skip "pref 30,64(a1)" for too short arrays
+    " nop                                        \n"
+    // otherwise, start with using pref30
+    "pref      30, 64(%[dst])                    \n"
+    "$ua_loop16w:                                \n"
+    "pref      0, 96(%[src])                     \n"
+    "lwr       $t0, 0(%[src])                    \n"
+    "lwl       $t0, 3(%[src])                    \n"
+    "lwr       $t1, 4(%[src])                    \n"
+    "bgtz      $v1, $ua_skip_pref30_96           \n"
+    " lwl      $t1, 7(%[src])                    \n"
+    "pref      30, 96(%[dst])                    \n"
+    // continue setting up the dest, addr 96
+    "$ua_skip_pref30_96:                         \n"
+    "lwr       $t2, 8(%[src])                    \n"
+    "lwl       $t2, 11(%[src])                   \n"
+    "lwr       $t3, 12(%[src])                   \n"
+    "lwl       $t3, 15(%[src])                   \n"
+    "lwr       $t4, 16(%[src])                   \n"
+    "lwl       $t4, 19(%[src])                   \n"
+    "lwr       $t5, 20(%[src])                   \n"
+    "lwl       $t5, 23(%[src])                   \n"
+    "lwr       $t6, 24(%[src])                   \n"
+    "lwl       $t6, 27(%[src])                   \n"
+    "lwr       $t7, 28(%[src])                   \n"
+    "lwl       $t7, 31(%[src])                   \n"
+    "pref      0, 128(%[src])                    \n"
+    // bring the next lines of src, addr 128
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "lwr       $t0, 32(%[src])                   \n"
+    "lwl       $t0, 35(%[src])                   \n"
+    "lwr       $t1, 36(%[src])                   \n"
+    "bgtz      $v1, ua_skip_pref30_128           \n"
+    " lwl      $t1, 39(%[src])                   \n"
+    "pref      30, 128(%[dst])                   \n"
+    // continue setting up the dest, addr 128
+    "ua_skip_pref30_128:                         \n"
+
+    "lwr       $t2, 40(%[src])                   \n"
+    "lwl       $t2, 43(%[src])                   \n"
+    "lwr       $t3, 44(%[src])                   \n"
+    "lwl       $t3, 47(%[src])                   \n"
+    "lwr       $t4, 48(%[src])                   \n"
+    "lwl       $t4, 51(%[src])                   \n"
+    "lwr       $t5, 52(%[src])                   \n"
+    "lwl       $t5, 55(%[src])                   \n"
+    "lwr       $t6, 56(%[src])                   \n"
+    "lwl       $t6, 59(%[src])                   \n"
+    "lwr       $t7, 60(%[src])                   \n"
+    "lwl       $t7, 63(%[src])                   \n"
+    "pref      0, 160(%[src])                    \n"
+    // bring the next lines of src, addr 160
+    "sw        $t0, 32(%[dst])                   \n"
+    "sw        $t1, 36(%[dst])                   \n"
+    "sw        $t2, 40(%[dst])                   \n"
+    "sw        $t3, 44(%[dst])                   \n"
+    "sw        $t4, 48(%[dst])                   \n"
+    "sw        $t5, 52(%[dst])                   \n"
+    "sw        $t6, 56(%[dst])                   \n"
+    "sw        $t7, 60(%[dst])                   \n"
+
+    "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
+    "sgtu      $v1,%[dst],$t9                    \n"
+    "bne       %[dst],$a3,$ua_loop16w            \n"
+    " addiu    %[src],%[src],64                  \n"  // adding 64 to src
+    "move      %[count],$t8                      \n"
+
+    // Here we have src and dest word-aligned but less than 64 bytes to go
+
+    "ua_chk8w:                                   \n"
+    "pref      0, 0x0(%[src])                    \n"
+    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
+    // t8 is the remainder count
+    "beq       %[count], $t8, $ua_chk1w          \n"
+    // when count==t8, no 32-byte chunk
+
+    "lwr       $t0, 0(%[src])                    \n"
+    "lwl       $t0, 3(%[src])                    \n"
+    "lwr       $t1, 4(%[src])                    \n"
+    "lwl       $t1, 7(%[src])                    \n"
+    "lwr       $t2, 8(%[src])                    \n"
+    "lwl       $t2, 11(%[src])                   \n"
+    "lwr       $t3, 12(%[src])                   \n"
+    "lwl       $t3, 15(%[src])                   \n"
+    "lwr       $t4, 16(%[src])                   \n"
+    "lwl       $t4, 19(%[src])                   \n"
+    "lwr       $t5, 20(%[src])                   \n"
+    "lwl       $t5, 23(%[src])                   \n"
+    "lwr       $t6, 24(%[src])                   \n"
+    "lwl       $t6, 27(%[src])                   \n"
+    "lwr       $t7, 28(%[src])                   \n"
+    "lwl       $t7, 31(%[src])                   \n"
+    "addiu     %[src], %[src], 32                \n"
+
+    "sw        $t0, 0(%[dst])                    \n"
+    "sw        $t1, 4(%[dst])                    \n"
+    "sw        $t2, 8(%[dst])                    \n"
+    "sw        $t3, 12(%[dst])                   \n"
+    "sw        $t4, 16(%[dst])                   \n"
+    "sw        $t5, 20(%[dst])                   \n"
+    "sw        $t6, 24(%[dst])                   \n"
+    "sw        $t7, 28(%[dst])                   \n"
+    "addiu     %[dst], %[dst], 32                \n"
+
+    "$ua_chk1w:                                  \n"
+    "andi      %[count], $t8, 0x3                \n"
+    // now count is the remainder past 1w chunks
+    "beq       %[count], $t8, ua_smallCopy       \n"
+    "subu      $a3, $t8, %[count]                \n"
+    // a3 is count of bytes in 1w chunks
+    "addu      $a3, %[dst], $a3                  \n"
+    // now a3 is the dst address past the 1w chunks
+
+    // copying in words (4-byte chunks)
+    "$ua_wordCopy_loop:                          \n"
+    "lwr       $v1, 0(%[src])                    \n"
+    "lwl       $v1, 3(%[src])                    \n"
+    "addiu     %[src], %[src], 4                 \n"
+    "addiu     %[dst], %[dst], 4                 \n"
+    // note: dst=a1 is word aligned here, see NOTE1
+    "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
+    " sw       $v1,-4(%[dst])                    \n"
+
+    // Now less than 4 bytes (value in count) left to copy
+    "ua_smallCopy:                               \n"
+    "beqz      %[count], leave                   \n"
+    " addu     $a3, %[dst], %[count]             \n" // a3 = last dst address
+    "$ua_smallCopy_loop:                         \n"
+    "lb        $v1, 0(%[src])                    \n"
+    "addiu     %[src], %[src], 1                 \n"
+    "addiu     %[dst], %[dst], 1                 \n"
+    "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
+    " sb       $v1, -1(%[dst])                   \n"
+
+    "j         $ra                               \n"
+    " nop                                        \n"
+    ".set      at                                \n"
+    ".set      reorder                           \n"
+       : [dst] "+r" (dst), [src] "+r" (src)
+       : [count] "r" (count)
+       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
+       "t8", "t9", "a3", "v1", "at"
+  );
+}
+#endif  // HAS_COPYROW_MIPS
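
For orientation, the assembly above is a hand-scheduled row copy: it prefetches, copies 64-byte and 32-byte chunks, then single words, then a byte tail, with an lwr/lwl path for unaligned sources. A minimal scalar sketch of the same word-then-byte-tail strategy (hypothetical helper name, not part of libyuv) would look like:

  #include <stdint.h>
  #include <string.h>

  // Copies 'count' bytes a word at a time, then finishes byte-by-byte,
  // mirroring the $wordCopy_loop / $last8loop structure above.
  static void CopyRow_Sketch(const uint8_t* src, uint8_t* dst, int count) {
    int words = count / 4;
    for (int i = 0; i < words; ++i) {
      uint32_t w;
      memcpy(&w, src + 4 * i, 4);  // plays the role of lw (or lwr+lwl)
      memcpy(dst + 4 * i, &w, 4);  // plays the role of sw
    }
    for (int i = words * 4; i < count; ++i) {  // the <4-byte tail
      dst[i] = src[i];
    }
  }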
+
+// MIPS DSPR2 functions
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
+    (__mips_dsp_rev >= 2)
+void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                           int width) {
+  __asm__ __volatile__ (
+    ".set push                                     \n"
+    ".set noreorder                                \n"
+    "srl             $t4, %[width], 4              \n"  // multiplies of 16
+    "blez            $t4, 2f                       \n"
+    " andi           %[width], %[width], 0xf       \n"  // residual
+
+    ".p2align        2                             \n"
+  "1:                                              \n"
+    "addiu           $t4, $t4, -1                  \n"
+    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
+    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
+    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
+    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
+    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
+    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
+    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
+    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
+    "addiu           %[src_uv], %[src_uv], 32      \n"
+    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
+    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
+    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
+    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
+    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
+    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
+    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
+    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
+    "sw              $t9, 0(%[dst_v])              \n"
+    "sw              $t0, 0(%[dst_u])              \n"
+    "sw              $t1, 4(%[dst_v])              \n"
+    "sw              $t2, 4(%[dst_u])              \n"
+    "sw              $t3, 8(%[dst_v])              \n"
+    "sw              $t5, 8(%[dst_u])              \n"
+    "sw              $t6, 12(%[dst_v])             \n"
+    "sw              $t7, 12(%[dst_u])             \n"
+    "addiu           %[dst_v], %[dst_v], 16        \n"
+    "bgtz            $t4, 1b                       \n"
+    " addiu          %[dst_u], %[dst_u], 16        \n"
+
+    "beqz            %[width], 3f                  \n"
+    " nop                                          \n"
+
+  "2:                                              \n"
+    "lbu             $t0, 0(%[src_uv])             \n"
+    "lbu             $t1, 1(%[src_uv])             \n"
+    "addiu           %[src_uv], %[src_uv], 2       \n"
+    "addiu           %[width], %[width], -1        \n"
+    "sb              $t0, 0(%[dst_u])              \n"
+    "sb              $t1, 0(%[dst_v])              \n"
+    "addiu           %[dst_u], %[dst_u], 1         \n"
+    "bgtz            %[width], 2b                  \n"
+    " addiu          %[dst_v], %[dst_v], 1         \n"
+
+  "3:                                              \n"
+    ".set pop                                      \n"
+     : [src_uv] "+r" (src_uv),
+       [width] "+r" (width),
+       [dst_u] "+r" (dst_u),
+       [dst_v] "+r" (dst_v)
+     :
+     : "t0", "t1", "t2", "t3",
+     "t4", "t5", "t6", "t7", "t8", "t9"
+  );
+}
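
The precrq.qb.ph / precr.qb.ph pairs above pick the odd and even bytes out of each pair of words, i.e. they deinterleave UV four pixels at a time. A plain C sketch of the per-pixel effect (hypothetical name, not the libyuv reference version):

  #include <stdint.h>

  // Splits interleaved U0 V0 U1 V1 ... into separate U and V planes.
  static void SplitUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                                uint8_t* dst_v, int width) {
    for (int i = 0; i < width; ++i) {
      dst_u[i] = src_uv[2 * i + 0];  // even bytes -> U (precr.qb.ph)
      dst_v[i] = src_uv[2 * i + 1];  // odd bytes  -> V (precrq.qb.ph)
    }
  }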
+
+void SplitUVRow_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
+                                     uint8* dst_v, int width) {
+  __asm__ __volatile__ (
+    ".set push                                     \n"
+    ".set noreorder                                \n"
+    "srl             $t4, %[width], 4              \n"  // multiplies of 16
+    "blez            $t4, 2f                       \n"
+    " andi           %[width], %[width], 0xf       \n"  // residual
+
+    ".p2align        2                             \n"
+  "1:                                              \n"
+    "addiu           $t4, $t4, -1                  \n"
+    "lwr             $t0, 0(%[src_uv])             \n"
+    "lwl             $t0, 3(%[src_uv])             \n"  // V1 | U1 | V0 | U0
+    "lwr             $t1, 4(%[src_uv])             \n"
+    "lwl             $t1, 7(%[src_uv])             \n"  // V3 | U3 | V2 | U2
+    "lwr             $t2, 8(%[src_uv])             \n"
+    "lwl             $t2, 11(%[src_uv])            \n"  // V5 | U5 | V4 | U4
+    "lwr             $t3, 12(%[src_uv])            \n"
+    "lwl             $t3, 15(%[src_uv])            \n"  // V7 | U7 | V6 | U6
+    "lwr             $t5, 16(%[src_uv])            \n"
+    "lwl             $t5, 19(%[src_uv])            \n"  // V9 | U9 | V8 | U8
+    "lwr             $t6, 20(%[src_uv])            \n"
+    "lwl             $t6, 23(%[src_uv])            \n"  // V11 | U11 | V10 | U10
+    "lwr             $t7, 24(%[src_uv])            \n"
+    "lwl             $t7, 27(%[src_uv])            \n"  // V13 | U13 | V12 | U12
+    "lwr             $t8, 28(%[src_uv])            \n"
+    "lwl             $t8, 31(%[src_uv])            \n"  // V15 | U15 | V14 | U14
+    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
+    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
+    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
+    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
+    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
+    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
+    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
+    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
+    "addiu           %[src_uv], %[src_uv], 32      \n"
+    "swr             $t9, 0(%[dst_v])              \n"
+    "swl             $t9, 3(%[dst_v])              \n"
+    "swr             $t0, 0(%[dst_u])              \n"
+    "swl             $t0, 3(%[dst_u])              \n"
+    "swr             $t1, 4(%[dst_v])              \n"
+    "swl             $t1, 7(%[dst_v])              \n"
+    "swr             $t2, 4(%[dst_u])              \n"
+    "swl             $t2, 7(%[dst_u])              \n"
+    "swr             $t3, 8(%[dst_v])              \n"
+    "swl             $t3, 11(%[dst_v])             \n"
+    "swr             $t5, 8(%[dst_u])              \n"
+    "swl             $t5, 11(%[dst_u])             \n"
+    "swr             $t6, 12(%[dst_v])             \n"
+    "swl             $t6, 15(%[dst_v])             \n"
+    "swr             $t7, 12(%[dst_u])             \n"
+    "swl             $t7, 15(%[dst_u])             \n"
+    "addiu           %[dst_u], %[dst_u], 16        \n"
+    "bgtz            $t4, 1b                       \n"
+    " addiu          %[dst_v], %[dst_v], 16        \n"
+
+    "beqz            %[width], 3f                  \n"
+    " nop                                          \n"
+
+  "2:                                              \n"
+    "lbu             $t0, 0(%[src_uv])             \n"
+    "lbu             $t1, 1(%[src_uv])             \n"
+    "addiu           %[src_uv], %[src_uv], 2       \n"
+    "addiu           %[width], %[width], -1        \n"
+    "sb              $t0, 0(%[dst_u])              \n"
+    "sb              $t1, 0(%[dst_v])              \n"
+    "addiu           %[dst_u], %[dst_u], 1         \n"
+    "bgtz            %[width], 2b                  \n"
+    " addiu          %[dst_v], %[dst_v], 1         \n"
+
+  "3:                                              \n"
+    ".set pop                                      \n"
+     : [src_uv] "+r" (src_uv),
+       [width] "+r" (width),
+       [dst_u] "+r" (dst_u),
+       [dst_v] "+r" (dst_v)
+     :
+     : "t0", "t1", "t2", "t3",
+     "t4", "t5", "t6", "t7", "t8", "t9"
+  );
+}
+
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
+  __asm__ __volatile__ (
+    ".set push                             \n"
+    ".set noreorder                        \n"
+
+    "srl       $t4, %[width], 4            \n"  // multiplies of 16
+    "andi      $t5, %[width], 0xf          \n"
+    "blez      $t4, 2f                     \n"
+    " addu     %[src], %[src], %[width]    \n"  // src += width
+
+    ".p2align  2                           \n"
+   "1:                                     \n"
+    "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
+    "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
+    "lw        $t2, -8(%[src])             \n"  // |11|10|9|8|
+    "lw        $t3, -4(%[src])             \n"  // |15|14|13|12|
+    "wsbh      $t0, $t0                    \n"  // |2|3|0|1|
+    "wsbh      $t1, $t1                    \n"  // |6|7|4|5|
+    "wsbh      $t2, $t2                    \n"  // |10|11|8|9|
+    "wsbh      $t3, $t3                    \n"  // |14|15|12|13|
+    "rotr      $t0, $t0, 16                \n"  // |0|1|2|3|
+    "rotr      $t1, $t1, 16                \n"  // |4|5|6|7|
+    "rotr      $t2, $t2, 16                \n"  // |8|9|10|11|
+    "rotr      $t3, $t3, 16                \n"  // |12|13|14|15|
+    "addiu     %[src], %[src], -16         \n"
+    "addiu     $t4, $t4, -1                \n"
+    "sw        $t3, 0(%[dst])              \n"  // |15|14|13|12|
+    "sw        $t2, 4(%[dst])              \n"  // |11|10|9|8|
+    "sw        $t1, 8(%[dst])              \n"  // |7|6|5|4|
+    "sw        $t0, 12(%[dst])             \n"  // |3|2|1|0|
+    "bgtz      $t4, 1b                     \n"
+    " addiu    %[dst], %[dst], 16          \n"
+    "beqz      $t5, 3f                     \n"
+    " nop                                  \n"
+
+   "2:                                     \n"
+    "lbu       $t0, -1(%[src])             \n"
+    "addiu     $t5, $t5, -1                \n"
+    "addiu     %[src], %[src], -1          \n"
+    "sb        $t0, 0(%[dst])              \n"
+    "bgez      $t5, 2b                     \n"
+    " addiu    %[dst], %[dst], 1           \n"
+
+   "3:                                     \n"
+    ".set pop                              \n"
+      : [src] "+r" (src), [dst] "+r" (dst)
+      : [width] "r" (width)
+      : "t0", "t1", "t2", "t3", "t4", "t5"
+  );
+}
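
MirrorRow reverses a row of bytes; wsbh swaps the bytes within each halfword and rotr by 16 swaps the halfwords, which together reverse a 4-byte word. The scalar equivalent (hypothetical name) is simply:

  #include <stdint.h>

  // Writes the bytes of src to dst in reverse order.
  static void MirrorRow_Sketch(const uint8_t* src, uint8_t* dst, int width) {
    for (int i = 0; i < width; ++i) {
      dst[i] = src[width - 1 - i];
    }
  }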
+
+void MirrorUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                            int width) {
+  int x = 0;
+  int y = 0;
+  __asm__ __volatile__ (
+    ".set push                                    \n"
+    ".set noreorder                               \n"
+
+    "addu            $t4, %[width], %[width]      \n"
+    "srl             %[x], %[width], 4            \n"
+    "andi            %[y], %[width], 0xf          \n"
+    "blez            %[x], 2f                     \n"
+    " addu           %[src_uv], %[src_uv], $t4    \n"
+
+    ".p2align        2                            \n"
+   "1:                                            \n"
+    "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
+    "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
+    "lw              $t2, -24(%[src_uv])          \n"  // |11|10|9|8|
+    "lw              $t3, -20(%[src_uv])          \n"  // |15|14|13|12|
+    "lw              $t4, -16(%[src_uv])          \n"  // |19|18|17|16|
+    "lw              $t6, -12(%[src_uv])          \n"  // |23|22|21|20|
+    "lw              $t7, -8(%[src_uv])           \n"  // |27|26|25|24|
+    "lw              $t8, -4(%[src_uv])           \n"  // |31|30|29|28|
+
+    "rotr            $t0, $t0, 16                 \n"  // |1|0|3|2|
+    "rotr            $t1, $t1, 16                 \n"  // |5|4|7|6|
+    "rotr            $t2, $t2, 16                 \n"  // |9|8|11|10|
+    "rotr            $t3, $t3, 16                 \n"  // |13|12|15|14|
+    "rotr            $t4, $t4, 16                 \n"  // |17|16|19|18|
+    "rotr            $t6, $t6, 16                 \n"  // |21|20|23|22|
+    "rotr            $t7, $t7, 16                 \n"  // |25|24|27|26|
+    "rotr            $t8, $t8, 16                 \n"  // |29|28|31|30|
+    "precr.qb.ph     $t9, $t0, $t1                \n"  // |0|2|4|6|
+    "precrq.qb.ph    $t5, $t0, $t1                \n"  // |1|3|5|7|
+    "precr.qb.ph     $t0, $t2, $t3                \n"  // |8|10|12|14|
+    "precrq.qb.ph    $t1, $t2, $t3                \n"  // |9|11|13|15|
+    "precr.qb.ph     $t2, $t4, $t6                \n"  // |16|18|20|22|
+    "precrq.qb.ph    $t3, $t4, $t6                \n"  // |17|19|21|23|
+    "precr.qb.ph     $t4, $t7, $t8                \n"  // |24|26|28|30|
+    "precrq.qb.ph    $t6, $t7, $t8                \n"  // |25|27|29|31|
+    "addiu           %[src_uv], %[src_uv], -32    \n"
+    "addiu           %[x], %[x], -1               \n"
+    "swr             $t4, 0(%[dst_u])             \n"
+    "swl             $t4, 3(%[dst_u])             \n"  // |30|28|26|24|
+    "swr             $t6, 0(%[dst_v])             \n"
+    "swl             $t6, 3(%[dst_v])             \n"  // |31|29|27|25|
+    "swr             $t2, 4(%[dst_u])             \n"
+    "swl             $t2, 7(%[dst_u])             \n"  // |22|20|18|16|
+    "swr             $t3, 4(%[dst_v])             \n"
+    "swl             $t3, 7(%[dst_v])             \n"  // |23|21|19|17|
+    "swr             $t0, 8(%[dst_u])             \n"
+    "swl             $t0, 11(%[dst_u])            \n"  // |14|12|10|8|
+    "swr             $t1, 8(%[dst_v])             \n"
+    "swl             $t1, 11(%[dst_v])            \n"  // |15|13|11|9|
+    "swr             $t9, 12(%[dst_u])            \n"
+    "swl             $t9, 15(%[dst_u])            \n"  // |6|4|2|0|
+    "swr             $t5, 12(%[dst_v])            \n"
+    "swl             $t5, 15(%[dst_v])            \n"  // |7|5|3|1|
+    "addiu           %[dst_v], %[dst_v], 16       \n"
+    "bgtz            %[x], 1b                     \n"
+    " addiu          %[dst_u], %[dst_u], 16       \n"
+    "beqz            %[y], 3f                     \n"
+    " nop                                         \n"
+    "b               2f                           \n"
+    " nop                                         \n"
+
+   "2:                                            \n"
+    "lbu             $t0, -2(%[src_uv])           \n"
+    "lbu             $t1, -1(%[src_uv])           \n"
+    "addiu           %[src_uv], %[src_uv], -2     \n"
+    "addiu           %[y], %[y], -1               \n"
+    "sb              $t0, 0(%[dst_u])             \n"
+    "sb              $t1, 0(%[dst_v])             \n"
+    "addiu           %[dst_u], %[dst_u], 1        \n"
+    "bgtz            %[y], 2b                     \n"
+    " addiu          %[dst_v], %[dst_v], 1        \n"
+
+   "3:                                            \n"
+    ".set pop                                     \n"
+      : [src_uv] "+r" (src_uv),
+        [dst_u] "+r" (dst_u),
+        [dst_v] "+r" (dst_v),
+        [x] "=&r" (x),
+        [y] "+r" (y)
+      : [width] "r" (width)
+      : "t0", "t1", "t2", "t3", "t4",
+      "t5", "t7", "t8", "t9"
+  );
+}
+
+// Convert 4 Y and 2 UV samples from I422 and arrange the RGB values into
+// t5 = | 0 | B0 | 0 | b0 |
+// t4 = | 0 | B1 | 0 | b1 |
+// t9 = | 0 | G0 | 0 | g0 |
+// t8 = | 0 | G1 | 0 | g1 |
+// t2 = | 0 | R0 | 0 | r0 |
+// t1 = | 0 | R1 | 0 | r1 |
+#define I422ToTransientMipsRGB                                                 \
+      "lw                $t0, 0(%[y_buf])       \n"                            \
+      "lhu               $t1, 0(%[u_buf])       \n"                            \
+      "lhu               $t2, 0(%[v_buf])       \n"                            \
+      "preceu.ph.qbr     $t1, $t1               \n"                            \
+      "preceu.ph.qbr     $t2, $t2               \n"                            \
+      "preceu.ph.qbra    $t3, $t0               \n"                            \
+      "preceu.ph.qbla    $t0, $t0               \n"                            \
+      "subu.ph           $t1, $t1, $s5          \n"                            \
+      "subu.ph           $t2, $t2, $s5          \n"                            \
+      "subu.ph           $t3, $t3, $s4          \n"                            \
+      "subu.ph           $t0, $t0, $s4          \n"                            \
+      "mul.ph            $t3, $t3, $s0          \n"                            \
+      "mul.ph            $t0, $t0, $s0          \n"                            \
+      "shll.ph           $t4, $t1, 0x7          \n"                            \
+      "subu.ph           $t4, $t4, $t1          \n"                            \
+      "mul.ph            $t6, $t1, $s1          \n"                            \
+      "mul.ph            $t1, $t2, $s2          \n"                            \
+      "addq_s.ph         $t5, $t4, $t3          \n"                            \
+      "addq_s.ph         $t4, $t4, $t0          \n"                            \
+      "shra.ph           $t5, $t5, 6            \n"                            \
+      "shra.ph           $t4, $t4, 6            \n"                            \
+      "addiu             %[u_buf], 2            \n"                            \
+      "addiu             %[v_buf], 2            \n"                            \
+      "addu.ph           $t6, $t6, $t1          \n"                            \
+      "mul.ph            $t1, $t2, $s3          \n"                            \
+      "addu.ph           $t9, $t6, $t3          \n"                            \
+      "addu.ph           $t8, $t6, $t0          \n"                            \
+      "shra.ph           $t9, $t9, 6            \n"                            \
+      "shra.ph           $t8, $t8, 6            \n"                            \
+      "addu.ph           $t2, $t1, $t3          \n"                            \
+      "addu.ph           $t1, $t1, $t0          \n"                            \
+      "shra.ph           $t2, $t2, 6            \n"                            \
+      "shra.ph           $t1, $t1, 6            \n"                            \
+      "subu.ph           $t5, $t5, $s5          \n"                            \
+      "subu.ph           $t4, $t4, $s5          \n"                            \
+      "subu.ph           $t9, $t9, $s5          \n"                            \
+      "subu.ph           $t8, $t8, $s5          \n"                            \
+      "subu.ph           $t2, $t2, $s5          \n"                            \
+      "subu.ph           $t1, $t1, $s5          \n"                            \
+      "shll_s.ph         $t5, $t5, 8            \n"                            \
+      "shll_s.ph         $t4, $t4, 8            \n"                            \
+      "shll_s.ph         $t9, $t9, 8            \n"                            \
+      "shll_s.ph         $t8, $t8, 8            \n"                            \
+      "shll_s.ph         $t2, $t2, 8            \n"                            \
+      "shll_s.ph         $t1, $t1, 8            \n"                            \
+      "shra.ph           $t5, $t5, 8            \n"                            \
+      "shra.ph           $t4, $t4, 8            \n"                            \
+      "shra.ph           $t9, $t9, 8            \n"                            \
+      "shra.ph           $t8, $t8, 8            \n"                            \
+      "shra.ph           $t2, $t2, 8            \n"                            \
+      "shra.ph           $t1, $t1, 8            \n"                            \
+      "addu.ph           $t5, $t5, $s5          \n"                            \
+      "addu.ph           $t4, $t4, $s5          \n"                            \
+      "addu.ph           $t9, $t9, $s5          \n"                            \
+      "addu.ph           $t8, $t8, $s5          \n"                            \
+      "addu.ph           $t2, $t2, $s5          \n"                            \
+      "addu.ph           $t1, $t1, $s5          \n"
+
+void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+  __asm__ __volatile__ (
+    ".set push                                \n"
+    ".set noreorder                           \n"
+    "beqz              %[width], 2f           \n"
+    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
+    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
+    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
+    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
+    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
+    "repl.ph           $s5, 128               \n"  // |128|128| // clipping
+    "lui               $s6, 0xff00            \n"
+    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|ff|
+
+    ".p2align          2                      \n"
+   "1:                                        \n"
+      I422ToTransientMipsRGB
+// Arranging into argb format
+    "precr.qb.ph       $t4, $t8, $t4          \n"  // |G1|g1|B1|b1|
+    "precr.qb.ph       $t5, $t9, $t5          \n"  // |G0|g0|B0|b0|
+    "addiu             %[width], -4           \n"
+    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |G1|B1|G0|B0|
+    "precr.qb.ph       $t9, $t4, $t5          \n"  // |g1|b1|g0|b0|
+    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
+
+    "addiu             %[y_buf], 4            \n"
+    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
+    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
+    "or                $t1, $t1, $s6          \n"  // |ff|R1|ff|R0|
+    "or                $t2, $t2, $s6          \n"  // |ff|r1|ff|r0|
+    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|r1|g1|b1|
+    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|R1|G1|B1|
+    "sll               $t9, $t9, 16           \n"
+    "sll               $t8, $t8, 16           \n"
+    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|r0|g0|b0|
+    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|R0|G0|B0|
+// Store results.
+    "sw                $t2, 0(%[rgb_buf])     \n"
+    "sw                $t0, 4(%[rgb_buf])     \n"
+    "sw                $t1, 8(%[rgb_buf])     \n"
+    "sw                $t3, 12(%[rgb_buf])    \n"
+    "bnez              %[width], 1b           \n"
+    " addiu            %[rgb_buf], 16         \n"
+   "2:                                        \n"
+    ".set pop                                 \n"
+      :[y_buf] "+r" (y_buf),
+       [u_buf] "+r" (u_buf),
+       [v_buf] "+r" (v_buf),
+       [width] "+r" (width),
+       [rgb_buf] "+r" (rgb_buf)
+      :
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6"
+  );
+}
+
+void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+  __asm__ __volatile__ (
+    ".set push                                \n"
+    ".set noreorder                           \n"
+    "beqz              %[width], 2f           \n"
+    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
+    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
+    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
+    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
+    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
+    "repl.ph           $s5, 128               \n"  // |128|128|
+    "lui               $s6, 0xff00            \n"
+    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|
+
+    ".p2align          2                       \n"
+   "1:                                         \n"
+      I422ToTransientMipsRGB
+// Arranging into abgr format
+    "precr.qb.ph      $t0, $t8, $t1           \n"  // |G1|g1|R1|r1|
+    "precr.qb.ph      $t3, $t9, $t2           \n"  // |G0|g0|R0|r0|
+    "precrq.qb.ph     $t8, $t0, $t3           \n"  // |G1|R1|G0|R0|
+    "precr.qb.ph      $t9, $t0, $t3           \n"  // |g1|r1|g0|r0|
+
+    "precr.qb.ph       $t2, $t4, $t5          \n"  // |B1|b1|B0|b0|
+    "addiu             %[width], -4           \n"
+    "addiu             %[y_buf], 4            \n"
+    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |B1|0 |B0|
+    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |b1|0 |b0|
+    "or                $t1, $t1, $s6          \n"  // |ff|B1|ff|B0|
+    "or                $t2, $t2, $s6          \n"  // |ff|b1|ff|b0|
+    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|b1|g1|r1|
+    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|B1|G1|R1|
+    "sll               $t9, $t9, 16           \n"
+    "sll               $t8, $t8, 16           \n"
+    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|b0|g0|r0|
+    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|B0|G0|R0|
+// Store results.
+    "sw                $t2, 0(%[rgb_buf])     \n"
+    "sw                $t0, 4(%[rgb_buf])     \n"
+    "sw                $t1, 8(%[rgb_buf])     \n"
+    "sw                $t3, 12(%[rgb_buf])    \n"
+    "bnez              %[width], 1b           \n"
+    " addiu            %[rgb_buf], 16         \n"
+   "2:                                        \n"
+    ".set pop                                 \n"
+      :[y_buf] "+r" (y_buf),
+       [u_buf] "+r" (u_buf),
+       [v_buf] "+r" (v_buf),
+       [width] "+r" (width),
+       [rgb_buf] "+r" (rgb_buf)
+      :
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6"
+  );
+}
+
+void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+  __asm__ __volatile__ (
+    ".set push                                \n"
+    ".set noreorder                           \n"
+    "beqz              %[width], 2f           \n"
+    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74 |74 |
+    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
+    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
+    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
+    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
+    "repl.ph           $s5, 128               \n"  // |128|128|
+    "lui               $s6, 0xff              \n"
+    "ori               $s6, 0xff              \n"  // |00|ff|00|ff|
+
+    ".p2align          2                      \n"
+   "1:                                        \n"
+      I422ToTransientMipsRGB
+      // Arranging into bgra format
+    "precr.qb.ph       $t4, $t4, $t8          \n"  // |B1|b1|G1|g1|
+    "precr.qb.ph       $t5, $t5, $t9          \n"  // |B0|b0|G0|g0|
+    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |B1|G1|B0|G0|
+    "precr.qb.ph       $t9, $t4, $t5          \n"  // |b1|g1|b0|g0|
+
+    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
+    "addiu             %[width], -4           \n"
+    "addiu             %[y_buf], 4            \n"
+    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
+    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
+    "sll               $t1, $t1, 8            \n"  // |R1|0 |R0|0 |
+    "sll               $t2, $t2, 8            \n"  // |r1|0 |r0|0 |
+    "or                $t1, $t1, $s6          \n"  // |R1|ff|R0|ff|
+    "or                $t2, $t2, $s6          \n"  // |r1|ff|r0|ff|
+    "precrq.ph.w       $t0, $t9, $t2          \n"  // |b1|g1|r1|ff|
+    "precrq.ph.w       $t3, $t8, $t1          \n"  // |B1|G1|R1|ff|
+    "sll               $t1, $t1, 16           \n"
+    "sll               $t2, $t2, 16           \n"
+    "packrl.ph         $t2, $t9, $t2          \n"  // |b0|g0|r0|ff|
+    "packrl.ph         $t1, $t8, $t1          \n"  // |B0|G0|R0|ff|
+// Store results.
+    "sw                $t2, 0(%[rgb_buf])     \n"
+    "sw                $t0, 4(%[rgb_buf])     \n"
+    "sw                $t1, 8(%[rgb_buf])     \n"
+    "sw                $t3, 12(%[rgb_buf])    \n"
+    "bnez              %[width], 1b           \n"
+    " addiu            %[rgb_buf], 16         \n"
+   "2:                                        \n"
+    ".set pop                                 \n"
+      :[y_buf] "+r" (y_buf),
+       [u_buf] "+r" (u_buf),
+       [v_buf] "+r" (v_buf),
+       [width] "+r" (width),
+       [rgb_buf] "+r" (rgb_buf)
+      :
+      : "t0", "t1",  "t2", "t3",  "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3",
+      "s4", "s5", "s6"
+  );
+}
+
+// Bilinear filter 8x2 -> 8x1
+void InterpolateRows_MIPS_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
+                                ptrdiff_t src_stride, int dst_width,
+                                int source_y_fraction) {
+  int y0_fraction = 256 - source_y_fraction;
+  const uint8* src_ptr1 = src_ptr + src_stride;
+
+  __asm__ __volatile__ (
+     ".set push                                           \n"
+     ".set noreorder                                      \n"
+
+     "replv.ph          $t0, %[y0_fraction]               \n"
+     "replv.ph          $t1, %[source_y_fraction]         \n"
+
+    ".p2align           2                                 \n"
+   "1:                                                    \n"
+     "lw                $t2, 0(%[src_ptr])                \n"
+     "lw                $t3, 0(%[src_ptr1])               \n"
+     "lw                $t4, 4(%[src_ptr])                \n"
+     "lw                $t5, 4(%[src_ptr1])               \n"
+     "muleu_s.ph.qbl    $t6, $t2, $t0                     \n"
+     "muleu_s.ph.qbr    $t7, $t2, $t0                     \n"
+     "muleu_s.ph.qbl    $t8, $t3, $t1                     \n"
+     "muleu_s.ph.qbr    $t9, $t3, $t1                     \n"
+     "muleu_s.ph.qbl    $t2, $t4, $t0                     \n"
+     "muleu_s.ph.qbr    $t3, $t4, $t0                     \n"
+     "muleu_s.ph.qbl    $t4, $t5, $t1                     \n"
+     "muleu_s.ph.qbr    $t5, $t5, $t1                     \n"
+     "addq.ph           $t6, $t6, $t8                     \n"
+     "addq.ph           $t7, $t7, $t9                     \n"
+     "addq.ph           $t2, $t2, $t4                     \n"
+     "addq.ph           $t3, $t3, $t5                     \n"
+     "shra.ph           $t6, $t6, 8                       \n"
+     "shra.ph           $t7, $t7, 8                       \n"
+     "shra.ph           $t2, $t2, 8                       \n"
+     "shra.ph           $t3, $t3, 8                       \n"
+     "precr.qb.ph       $t6, $t6, $t7                     \n"
+     "precr.qb.ph       $t2, $t2, $t3                     \n"
+     "addiu             %[src_ptr], %[src_ptr], 8         \n"
+     "addiu             %[src_ptr1], %[src_ptr1], 8       \n"
+     "addiu             %[dst_width], %[dst_width], -8    \n"
+     "sw                $t6, 0(%[dst_ptr])                \n"
+     "sw                $t2, 4(%[dst_ptr])                \n"
+     "bgtz              %[dst_width], 1b                  \n"
+     " addiu            %[dst_ptr], %[dst_ptr], 8         \n"
+
+     ".set pop                                            \n"
+  : [dst_ptr] "+r" (dst_ptr),
+    [src_ptr1] "+r" (src_ptr1),
+    [src_ptr] "+r" (src_ptr),
+    [dst_width] "+r" (dst_width)
+  : [source_y_fraction] "r" (source_y_fraction),
+    [y0_fraction] "r" (y0_fraction),
+    [src_stride] "r" (src_stride)
+  : "t0", "t1", "t2", "t3", "t4", "t5",
+    "t6", "t7", "t8", "t9"
+  );
+}
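
InterpolateRows blends the two source rows with weights y0_fraction and source_y_fraction (which sum to 256) and shifts right by 8. A scalar sketch of the same blend (hypothetical name):

  #include <stdint.h>
  #include <stddef.h>

  // dst[i] = (src[i] * (256 - f) + src[i + stride] * f) / 256
  static void InterpolateRow_Sketch(uint8_t* dst, const uint8_t* src,
                                    ptrdiff_t src_stride, int width, int f) {
    const uint8_t* src1 = src + src_stride;
    int f0 = 256 - f;
    for (int i = 0; i < width; ++i) {
      dst[i] = (uint8_t)((src[i] * f0 + src1[i] * f) >> 8);
    }
  }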
+#endif  // __mips_dsp_rev >= 2
+
+#endif  // defined(__mips__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/row_neon.cc
@@ -1,0 +1,2844 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422                                                             \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
+    "vld1.32    {d2[1]}, [%2]!                 \n"
+
+// Read 8 Y, 2 U and 2 V from 411
+#define READYUV411                                                             \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
+    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vzip.u8    d2, d3                         \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444                                                             \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
+    "vld1.8     {d3}, [%2]!                    \n"                             \
+    "vpaddl.u8  q1, q1                         \n"                             \
+    "vrshrn.u16 d2, q1, #1                     \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400                                                             \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "vmov.u8    d2, #128                       \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
+    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
+    "vuzp.u8    d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21                                                               \
+    "vld1.8     {d0}, [%0]!                    \n"                             \
+    "vld1.8     {d2}, [%1]!                    \n"                             \
+    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
+    "vuzp.u8    d3, d2                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+// Read 8 YUY2
+#define READYUY2                                                               \
+    "vld2.8     {d0, d2}, [%0]!                \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vuzp.u8    d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+// Read 8 UYVY
+#define READUYVY                                                               \
+    "vld2.8     {d2, d3}, [%0]!                \n"                             \
+    "vmov.u8    d0, d3                         \n"                             \
+    "vmov.u8    d3, d2                         \n"                             \
+    "vuzp.u8    d2, d3                         \n"                             \
+    "vtrn.u32   d2, d3                         \n"
+
+#define YUV422TORGB                                                            \
+    "veor.u8    d2, d26                        \n"/*subtract 128 from u and v*/\
+    "vmull.s8   q8, d2, d24                    \n"/*  u/v B/R component      */\
+    "vmull.s8   q9, d2, d25                    \n"/*  u/v G component        */\
+    "vmov.u8    d1, #0                         \n"/*  split odd/even y apart */\
+    "vtrn.u8    d0, d1                         \n"                             \
+    "vsub.s16   q0, q0, q15                    \n"/*  offset y               */\
+    "vmul.s16   q0, q0, q14                    \n"                             \
+    "vadd.s16   d18, d19                       \n"                             \
+    "vqadd.s16  d20, d0, d16                   \n" /* B */                     \
+    "vqadd.s16  d21, d1, d16                   \n"                             \
+    "vqadd.s16  d22, d0, d17                   \n" /* R */                     \
+    "vqadd.s16  d23, d1, d17                   \n"                             \
+    "vqadd.s16  d16, d0, d18                   \n" /* G */                     \
+    "vqadd.s16  d17, d1, d18                   \n"                             \
+    "vqshrun.s16 d0, q10, #6                   \n" /* B */                     \
+    "vqshrun.s16 d1, q11, #6                   \n" /* G */                     \
+    "vqshrun.s16 d2, q8, #6                    \n" /* R */                     \
+    "vmovl.u8   q10, d0                        \n"/*  set up for reinterleave*/\
+    "vmovl.u8   q11, d1                        \n"                             \
+    "vmovl.u8   q8, d2                         \n"                             \
+    "vtrn.u8    d20, d21                       \n"                             \
+    "vtrn.u8    d22, d23                       \n"                             \
+    "vtrn.u8    d16, d17                       \n"                             \
+    "vmov.u8    d21, d16                       \n"
+
+static vec8 kUVToRB  = { 127, 127, 127, 127, 102, 102, 102, 102,
+                         0, 0, 0, 0, 0, 0, 0, 0 };
+static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
+                       0, 0, 0, 0, 0, 0, 0, 0 };
+
+void I444ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%5]                    \n"
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV444
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%5]                    \n"
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I411ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%5]                    \n"
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV411
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_argb),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToBGRARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_bgra,
+                        int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%5]                    \n"
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vswp.u8    d20, d22                       \n"
+    "vmov.u8    d19, #255                      \n"
+    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_bgra),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToABGRRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_abgr,
+                        int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%5]                    \n"
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vswp.u8    d20, d22                       \n"
+    "vmov.u8    d23, #255                      \n"
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_abgr),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToRGBARow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_rgba,
+                        int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%5]                    \n"
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d19, #255                      \n"
+    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_u),     // %1
+      "+r"(src_v),     // %2
+      "+r"(dst_rgba),  // %3
+      "+r"(width)      // %4
+    : "r"(&kUVToRB),   // %5
+      "r"(&kUVToG)     // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToRGB24Row_NEON(const uint8* src_y,
+                         const uint8* src_u,
+                         const uint8* src_v,
+                         uint8* dst_rgb24,
+                         int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%5]                    \n"
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vst3.8     {d20, d21, d22}, [%3]!         \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),      // %0
+      "+r"(src_u),      // %1
+      "+r"(src_v),      // %2
+      "+r"(dst_rgb24),  // %3
+      "+r"(width)       // %4
+    : "r"(&kUVToRB),    // %5
+      "r"(&kUVToG)      // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void I422ToRAWRow_NEON(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_raw,
+                       int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%5]                    \n"
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vswp.u8    d20, d22                       \n"
+    "vst3.8     {d20, d21, d22}, [%3]!         \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_raw),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+#define ARGBTORGB565                                                           \
+    "vshr.u8    d20, d20, #3                   \n"  /* B                    */ \
+    "vshr.u8    d21, d21, #2                   \n"  /* G                    */ \
+    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
+    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
+    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
+    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
+    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
+    "vshl.u16   q10, q10, #11                  \n"  /* R                    */ \
+    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
+    "vorr       q0, q0, q10                    \n"  /* BGR                  */
+
+void I422ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_u,
+                          const uint8* src_v,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%5]                    \n"
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    ARGBTORGB565
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_rgb565),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
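+// Pack 8 pixels held in d20 (B), d21 (G), d22 (R), d23 (A) into ARGB1555 in
+// q0: (A >> 7) << 15 | (R >> 3) << 10 | (G >> 3) << 5 | (B >> 3).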
+#define ARGBTOARGB1555                                                         \
+    "vshr.u8    q10, q10, #3                   \n"  /* B                    */ \
+    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
+    "vshr.u8    d23, d23, #7                   \n"  /* A                    */ \
+    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
+    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
+    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
+    "vmovl.u8   q11, d23                       \n"  /* A                    */ \
+    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
+    "vshl.u16   q10, q10, #10                  \n"  /* R                    */ \
+    "vshl.u16   q11, q11, #15                  \n"  /* A                    */ \
+    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
+    "vorr       q1, q10, q11                   \n"  /* RA                   */ \
+    "vorr       q0, q0, q1                     \n"  /* BGRA                 */
+
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb1555,
+                            int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%5]                    \n"
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    ARGBTOARGB1555
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
+    "bgt        1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb1555),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
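+// Pack 8 pixels held in d20..d23 (B, G, R, A) into ARGB4444 in q0, keeping the
+// high nibble of each channel.  d4 must hold 0x0f (the bits cleared by vbic).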
+#define ARGBTOARGB4444                                                         \
+    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
+    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
+    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
+    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
+    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
+    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
+    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
+
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb4444,
+                            int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%5]                    \n"
+    "vld1.8     {d25}, [%6]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV422
+    YUV422TORGB
+    "subs       %4, %4, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    ARGBTOARGB4444
+    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
+    "bgt        1b                             \n"
+    : "+r"(src_y),    // %0
+      "+r"(src_u),    // %1
+      "+r"(src_v),    // %2
+      "+r"(dst_argb4444),  // %3
+      "+r"(width)     // %4
+    : "r"(&kUVToRB),  // %5
+      "r"(&kUVToG)    // %6
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
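+// Convert 8 Y values to ARGB via READYUV400 and the YUV422TORGB matrix;
+// alpha is set to 255.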
+void YToARGBRow_NEON(const uint8* src_y,
+                     uint8* dst_argb,
+                     int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%3]                    \n"
+    "vld1.8     {d25}, [%4]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUV400
+    YUV422TORGB
+    "subs       %2, %2, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : "r"(&kUVToRB),   // %3
+      "r"(&kUVToG)     // %4
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
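+// Replicate 8 Y values into B, G and R and set alpha to 255 (no YUV matrix).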
+void I400ToARGBRow_NEON(const uint8* src_y,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+    "vmov.u8    d23, #255                      \n"
+  "1:                                          \n"
+    "vld1.8     {d20}, [%0]!                   \n"
+    "vmov       d21, d20                       \n"
+    "vmov       d22, d20                       \n"
+    "subs       %2, %2, #8                     \n"
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    :
+    : "cc", "memory", "d20", "d21", "d22", "d23"
+  );
+}
+
+void NV12ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%4]                    \n"
+    "vld1.8     {d25}, [%5]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV12
+    YUV422TORGB
+    "subs       %3, %3, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : "r"(&kUVToRB),   // %4
+      "r"(&kUVToG)     // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void NV21ToARGBRow_NEON(const uint8* src_y,
+                        const uint8* src_uv,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%4]                    \n"
+    "vld1.8     {d25}, [%5]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV21
+    YUV422TORGB
+    "subs       %3, %3, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_argb),  // %2
+      "+r"(width)      // %3
+    : "r"(&kUVToRB),   // %4
+      "r"(&kUVToG)     // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%4]                    \n"
+    "vld1.8     {d25}, [%5]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV12
+    YUV422TORGB
+    "subs       %3, %3, #8                     \n"
+    ARGBTORGB565
+    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : "r"(&kUVToRB),   // %4
+      "r"(&kUVToG)     // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+                          const uint8* src_uv,
+                          uint8* dst_rgb565,
+                          int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%4]                    \n"
+    "vld1.8     {d25}, [%5]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READNV21
+    YUV422TORGB
+    "subs       %3, %3, #8                     \n"
+    ARGBTORGB565
+    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_uv),    // %1
+      "+r"(dst_rgb565),  // %2
+      "+r"(width)      // %3
+    : "r"(&kUVToRB),   // %4
+      "r"(&kUVToG)     // %5
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%3]                    \n"
+    "vld1.8     {d25}, [%4]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READYUY2
+    YUV422TORGB
+    "subs       %2, %2, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_yuy2),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : "r"(&kUVToRB),   // %3
+      "r"(&kUVToG)     // %4
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+                        uint8* dst_argb,
+                        int width) {
+  asm volatile (
+    "vld1.8     {d24}, [%3]                    \n"
+    "vld1.8     {d25}, [%4]                    \n"
+    "vmov.u8    d26, #128                      \n"
+    "vmov.u16   q14, #74                       \n"
+    "vmov.u16   q15, #16                       \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    READUYVY
+    YUV422TORGB
+    "subs       %2, %2, #8                     \n"
+    "vmov.u8    d23, #255                      \n"
+    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+    "bgt        1b                             \n"
+    : "+r"(src_uyvy),  // %0
+      "+r"(dst_argb),  // %1
+      "+r"(width)      // %2
+    : "r"(&kUVToRB),   // %3
+      "r"(&kUVToG)     // %4
+    : "cc", "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Reads 16 pairs of UV and writes the even (U) bytes to dst_u and the odd (V)
+// bytes to dst_v.
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                     int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vst1.8     {q0}, [%1]!                    \n"  // store U
+    "vst1.8     {q1}, [%2]!                    \n"  // store V
+    "bgt        1b                             \n"
+    : "+r"(src_uv),  // %0
+      "+r"(dst_u),   // %1
+      "+r"(dst_v),   // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load U
+    "vld1.8     {q1}, [%1]!                    \n"  // load V
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
+    "bgt        1b                             \n"
+    : "+r"(src_u),   // %0
+      "+r"(src_v),   // %1
+      "+r"(dst_uv),  // %2
+      "+r"(width)    // %3  // Output registers
+    :                       // Input registers
+    : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// Copy a multiple of 32 bytes.  vld1.8 of four registers allows unaligned
+// access and is fastest on A15.
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
+    "subs       %2, %2, #32                    \n"  // 32 processed per loop
+    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
+    "bgt        1b                             \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2  // Output registers
+  :                     // Input registers
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// SetRow writes 'count' bytes using a 32 bit value repeated.
+void SetRow_NEON(uint8* dst, uint32 v32, int count) {
+  asm volatile (
+    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
+    "1:                                        \n"
+    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
+    "vst1.8    {q0}, [%0]!                     \n"  // store
+    "bgt       1b                              \n"
+  : "+r"(dst),   // %0
+    "+r"(count)  // %1
+  : "r"(v32)     // %2
+  : "cc", "memory", "q0"
+  );
+}
+
+// TODO(fbarchard): Make fully assembler
+// ARGBSetRows writes 'width' 32 bit words per row for 'height' rows, repeating v32.
+void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
+                      int dst_stride, int height) {
+  for (int y = 0; y < height; ++y) {
+    SetRow_NEON(dst, v32, width << 2);
+    dst += dst_stride;
+  }
+}
+
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    // Start at end of source row.
+    "mov        r3, #-16                       \n"
+    "add        %0, %0, %2                     \n"
+    "sub        %0, #16                        \n"
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+    "subs       %2, #16                        \n"  // 16 pixels per loop.
+    "vrev64.8   q0, q0                         \n"
+    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    "vst1.8     {d0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "cc", "memory", "r3", "q0"
+  );
+}
+
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                      int width) {
+  asm volatile (
+    // Start at end of source row.
+    "mov        r12, #-16                      \n"
+    "add        %0, %0, %3, lsl #1             \n"
+    "sub        %0, #16                        \n"
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
+    "subs       %3, #8                         \n"  // 8 pixels per loop.
+    "vrev64.8   q0, q0                         \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+    "vst1.8     {d1}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_uv),  // %0
+    "+r"(dst_u),   // %1
+    "+r"(dst_v),   // %2
+    "+r"(width)    // %3
+  :
+  : "cc", "memory", "r12", "q0"
+  );
+}
+
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    // Start at end of source row.
+    "mov        r3, #-16                       \n"
+    "add        %0, %0, %2, lsl #2             \n"
+    "sub        %0, #16                        \n"
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+    "subs       %2, #4                         \n"  // 4 pixels per loop.
+    "vrev64.32  q0, q0                         \n"
+    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+    "vst1.8     {d0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "cc", "memory", "r3", "q0"
+  );
+}
+
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),   // %1
+    "+r"(pix)         // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d1, d3                         \n"  // swap R, B
+    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
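+// Unpack 8 RGB565 pixels in q0 to 8-bit B (d0), G (d1), R (d2), replicating
+// the top bits of each field into the low bits.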
+#define RGB565TOARGB                                                           \
+    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
+    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
+    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
+    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
+    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
+    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
+    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
+    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
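+// Unpack 8 ARGB1555 pixels in q0 to 8-bit B (d0), G (d1), R (d2), A (d3);
+// 5-bit fields are bit-replicated and the 1-bit alpha becomes 0 or 255.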
+#define ARGB1555TOARGB                                                         \
+    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
+    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
+    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
+    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
+    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
+    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
+    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
+    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
+    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
+    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
+    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
+
+// RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB                                                           \
+    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
+    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
+    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
+    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
+    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
+    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
+    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
+    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
+    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
+    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB1555TOARGB
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
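+// Unpack 8 ARGB4444 pixels in q0 to 8-bit B (d0), G (d1), R (d2), A (d3) by
+// duplicating each nibble into both halves of the byte.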
+#define ARGB4444TOARGB                                                         \
+    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
+    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
+    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
+    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
+    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
+    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
+    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
+    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
+
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // Alpha
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB4444TOARGB
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_argb),    // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_rgb24),  // %1
+    "+r"(pix)         // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d1, d3                         \n"  // swap R, B
+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_raw),   // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+
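+// Copy the Y channel (even bytes) of YUY2 to dst_y, 16 pixels per loop.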
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
+    "bgt        1b                             \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
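+// Copy the Y channel (odd bytes) of UYVY to dst_y, 16 pixels per loop.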
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
+    "bgt        1b                             \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+
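+// Extract U and V from two rows of YUY2, averaging the rows vertically:
+// 16 pixels per loop -> 8 U and 8 V.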
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // stride + src_yuy2
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
+    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
+    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_yuy2),     // %0
+    "+r"(stride_yuy2),  // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(pix)           // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  );
+}
+
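+// Extract U and V from two rows of UYVY, averaging the rows vertically:
+// 16 pixels per loop -> 8 U and 8 V.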
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // stride + src_uyvy
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
+    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
+    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_uyvy),     // %0
+    "+r"(stride_uyvy),  // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(pix)           // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  );
+}
+
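+// Average two rows of bytes with a rounding halving add, 16 bytes per loop.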
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix) {
+  asm volatile (
+    // add the stride to src_uv to form a row 2 pointer
+    "add        %1, %0                         \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load row 1 16 pixels.
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vld1.8     {q1}, [%1]!                    \n"  // load row 2 16 pixels.
+    "vrhadd.u8  q0, q1                         \n"  // average row 1 and 2
+    "vst1.8     {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_uv),         // %0
+    "+r"(src_uv_stride),  // %1
+    "+r"(dst_uv),         // %2
+    "+r"(pix)             // %3
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// Select 2 channels from ARGB on alternating pixels.  e.g.  BGBGBGBG
+void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+                         uint32 selector, int pix) {
+  asm volatile (
+    "vmov.u32   d6[0], %3                      \n"  // selector
+  "1:                                          \n"
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 8 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "vtbl.8     d4, {d0, d1}, d6               \n"  // look up 4 pixels
+    "vtbl.8     d5, {d2, d3}, d6               \n"  // look up 4 pixels
+    "vtrn.u32   d4, d5                         \n"  // combine 8 pixels
+    "vst1.8     {d4}, [%1]!                    \n"  // store 8.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_bayer),  // %1
+    "+r"(pix)         // %2
+  : "r"(selector)     // %3
+  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+// Select G channels from ARGB.  e.g.  GGGGGGGG
+void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+                           uint32 /*selector*/, int pix) {
+  asm volatile (
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_bayer),  // %1
+    "+r"(pix)         // %2
+  :
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  asm volatile (
+    "vld1.8     {q2}, [%3]                     \n"  // shuffler
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
+    "subs       %2, %2, #4                     \n"  // 4 processed per loop
+    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
+    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+  );
+}
+
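+// Interleave 16 Y, 8 U and 8 V into YUY2 (Y0 U0 Y1 V0 ...), 16 pixels per loop.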
+void I422ToYUY2Row_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_yuy2, int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
+    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
+    "subs       %4, %4, #16                    \n"  // 16 pixels
+    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_yuy2),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"
+  );
+}
+
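+// Interleave 16 Y, 8 U and 8 V into UYVY (U0 Y0 V0 Y1 ...), 16 pixels per loop.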
+void I422ToUYVYRow_NEON(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_uyvy, int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
+    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
+    "subs       %4, %4, #16                    \n"  // 16 pixels
+    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_y),     // %0
+    "+r"(src_u),     // %1
+    "+r"(src_v),     // %2
+    "+r"(dst_uyvy),  // %3
+    "+r"(width)      // %4
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3"
+  );
+}
+
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTORGB565
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_rgb565),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  );
+}
+
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+                            int pix) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTOARGB1555
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb1555),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  );
+}
+
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+                            int pix) {
+  asm volatile (
+    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGBTOARGB4444
+    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb4444),  // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+  );
+}
+
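+// BT.601 studio-range luma, 8 pixels per loop:
+// Y = ((13 * B + 65 * G + 33 * R + 64) >> 7) + 16.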
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+
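+// Full-range (JPEG) luma, 8 pixels per loop:
+// Y = (15 * B + 75 * G + 38 * R + 64) >> 7.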
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+
+// 8x1 pixels -> 8x1 U, V.  pix is number of argb pixels.
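+// U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
+// V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8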
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
+    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
+    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
+    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
+    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlsl.u8   q2, d1, d25                    \n"  // G
+    "vmlsl.u8   q2, d2, d26                    \n"  // R
+    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
+
+    "vmull.u8   q3, d2, d24                    \n"  // R
+    "vmlsl.u8   q3, d1, d28                    \n"  // G
+    "vmlsl.u8   q3, d0, d27                    \n"  // B
+    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
+
+    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
+
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q0, q10                    \n"  // B
+    "vmls.s16   q8, q1, q11                    \n"  // G
+    "vmls.s16   q8, q2, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+
+    "vmul.s16   q9, q2, q10                    \n"  // R
+    "vmls.s16   q9, q1, q14                    \n"  // G
+    "vmls.s16   q9, q0, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 32x1 pixels -> 8x1.  pix is number of argb pixels. e.g. 32.
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
+    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
+    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
+    "vpadd.u16  d1, d8, d9                     \n"  // B
+    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
+    "vpadd.u16  d3, d10, d11                   \n"  // G
+    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
+    "vpadd.u16  d5, d12, d13                   \n"  // R
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
+    "vmul.s16   q8, q0, q10                    \n"  // B
+    "vmls.s16   q8, q1, q11                    \n"  // G
+    "vmls.s16   q8, q2, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q2, q10                    \n"  // R
+    "vmls.s16   q9, q1, q14                    \n"  // G
+    "vmls.s16   q9, q0, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
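+// RGBTOUV computes 8 U (d0) and 8 V (d1) values from averaged B/G/R shorts in
+// QB/QG/QR, using the halved coefficients in q10-q14 and the 0x8080 bias in
+// q15.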
+#define RGBTOUV(QB, QG, QR) \
+    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
+    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
+    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
+    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
+    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
+    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
+    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
+    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
+    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
+    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_stride_argb),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// TODO(fbarchard): Subsample match C code.
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
+    "vmov.s16   q11, #84 / 2                   \n"  // UG -0.33126 coefficient
+    "vmov.s16   q12, #43 / 2                   \n"  // UR -0.16874 coefficient
+    "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
+    "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_stride_argb),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
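+// A rough scalar equivalent of ARGBToUVJRow_NEON above (illustrative sketch;
+// the reference C path lives in row_common.cc), applied to the 2x2-averaged
+// values ab, ag, ar of each block:
+//   u = (127 * ab - 84 * ag - 43 * ar + 0x8080) >> 8;
+//   v = (127 * ar - 107 * ag - 20 * ab + 0x8080) >> 8;
+// The NEON path halves the coefficients because the averaged inputs are kept
+// at twice their value (sum of four samples >> 1).
+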
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
+    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
+    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q1, q1, #1                     \n"  // 2x average
+    "vrshr.u16  q2, q2, #1                     \n"
+    "vrshr.u16  q3, q3, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q3, q2, q1)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(src_stride_bgra),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
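+// BGRAToUVRow_NEON above and the ABGR/RGBA/RGB24/RAW variants below all
+// reduce to roughly the same scalar computation (illustrative sketch; the
+// reference C path lives in row_common.cc), applied to the 2x2-averaged
+// values ab, ag, ar:
+//   u = (112 * ab - 74 * ag - 38 * ar + 0x8080) >> 8;
+//   v = (112 * ar - 94 * ag - 18 * ab + 0x8080) >> 8;
+// Only the order in which B, G and R are picked out of memory differs.
+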
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
+    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
+    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q2, q1, q0)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(src_stride_abgr),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_rgba
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
+    "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
+    "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q7                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(src_stride_rgba),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
+    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
+    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
+    "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q0, q1, q2)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(src_stride_rgb24),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_raw
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
+    "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
+    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
+    "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
+    "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
+    "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q0, q4                         \n"  // R 16 bytes -> 8 shorts.
+
+    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
+    "vrshr.u16  q1, q1, #1                     \n"
+    "vrshr.u16  q2, q2, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q2, q1, q0)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_raw),  // %0
+    "+r"(src_stride_raw),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  pix is the number of RGB565 pixels, e.g. 16.
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+                        uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
+    RGB565TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(src_stride_rgb565),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  pix is the number of ARGB1555 pixels, e.g. 16.
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+                          uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
+    RGB555TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(src_stride_argb1555),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// 16x2 pixels -> 8x1.  pix is the number of ARGB4444 pixels, e.g. 16.
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+                          uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_argb
+    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
+    ARGB4444TOARGB
+    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+
+    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+    "vrshr.u16  q5, q5, #1                     \n"
+    "vrshr.u16  q6, q6, #1                     \n"
+
+    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+    "vmul.s16   q8, q4, q10                    \n"  // B
+    "vmls.s16   q8, q5, q11                    \n"  // G
+    "vmls.s16   q8, q6, q12                    \n"  // R
+    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+    "vmul.s16   q9, q6, q10                    \n"  // R
+    "vmls.s16   q9, q5, q14                    \n"  // G
+    "vmls.s16   q9, q4, q13                    \n"  // B
+    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(src_stride_argb4444),  // %1
+    "+r"(dst_u),     // %2
+    "+r"(dst_v),     // %3
+    "+r"(pix)        // %4
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    RGB565TOARGB
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb565),  // %0
+    "+r"(dst_y),       // %1
+    "+r"(pix)          // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+
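+// RGB565ToYRow_NEON above and the *ToYRow_NEON variants that follow compute
+// roughly the following per pixel (illustrative scalar sketch; the reference
+// C path lives in row_common.cc):
+//   y = ((13 * b + 65 * g + 33 * r + 64) >> 7) + 16;   // saturated to 255
+// The variants differ only in how b, g and r are unpacked from the source
+// format.
+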
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB1555TOARGB
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb1555),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+    "vmov.u8    d27, #16                       \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    ARGB4444TOARGB
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d27                        \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_argb4444),  // %0
+    "+r"(dst_y),         // %1
+    "+r"(pix)            // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+  );
+}
+
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // R
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // R
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // B
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d1, d4                     \n"  // B
+    "vmlal.u8   q8, d2, d5                     \n"  // G
+    "vmlal.u8   q8, d3, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // B
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // B
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+  : "+r"(src_raw),  // %0
+    "+r"(dst_y),  // %1
+    "+r"(pix)        // %2
+  :
+  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_NEON(uint8* dst_ptr,
+                         const uint8* src_ptr, ptrdiff_t src_stride,
+                         int dst_width, int source_y_fraction) {
+  asm volatile (
+    "cmp        %4, #0                         \n"
+    "beq        100f                           \n"
+    "add        %2, %1                         \n"
+    "cmp        %4, #64                        \n"
+    "beq        75f                            \n"
+    "cmp        %4, #128                       \n"
+    "beq        50f                            \n"
+    "cmp        %4, #192                       \n"
+    "beq        25f                            \n"
+
+    "vdup.8     d5, %4                         \n"
+    "rsb        %4, #256                       \n"
+    "vdup.8     d4, %4                         \n"
+    // General purpose row blend.
+  "1:                                          \n"
+    "vld1.8     {q0}, [%1]!                    \n"
+    "vld1.8     {q1}, [%2]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vmull.u8   q13, d0, d4                    \n"
+    "vmull.u8   q14, d1, d4                    \n"
+    "vmlal.u8   q13, d2, d5                    \n"
+    "vmlal.u8   q14, d3, d5                    \n"
+    "vrshrn.u16 d0, q13, #8                    \n"
+    "vrshrn.u16 d1, q14, #8                    \n"
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        1b                             \n"
+    "b          99f                            \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    "vld1.8     {q0}, [%1]!                    \n"
+    "vld1.8     {q1}, [%2]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vrhadd.u8  q0, q1                         \n"
+    "vrhadd.u8  q0, q1                         \n"
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        25b                            \n"
+    "b          99f                            \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    "vld1.8     {q0}, [%1]!                    \n"
+    "vld1.8     {q1}, [%2]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vrhadd.u8  q0, q1                         \n"
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        50b                            \n"
+    "b          99f                            \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    "vld1.8     {q1}, [%1]!                    \n"
+    "vld1.8     {q0}, [%2]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vrhadd.u8  q0, q1                         \n"
+    "vrhadd.u8  q0, q1                         \n"
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        75b                            \n"
+    "b          99f                            \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    "vld1.8     {q0}, [%1]!                    \n"
+    "subs       %3, %3, #16                    \n"
+    "vst1.8     {q0}, [%0]!                    \n"
+    "bgt        100b                           \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction) // %4
+  :
+  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+  );
+}
+
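+// For the general case, InterpolateRow_NEON above blends the two rows as,
+// roughly (illustrative scalar sketch, with f = source_y_fraction in
+// [0, 256)):
+//   dst_ptr[i] = (src_ptr[i] * (256 - f) +
+//                 src_ptr[i + src_stride] * f + 128) >> 8;
+// f == 0, 64, 128 and 192 are special-cased with a copy and rounding
+// halving adds.
+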
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  asm volatile (
+    "subs       %3, #8                         \n"
+    "blt        89f                            \n"
+    // Blend 8 pixels.
+  "8:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q10, d4, d3                    \n"  // db * a
+    "vmull.u8   q11, d5, d3                    \n"  // dg * a
+    "vmull.u8   q12, d6, d3                    \n"  // dr * a
+    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+    "vqadd.u8   d2, d2, d6                     \n"  // + sr
+    "vmov.u8    d3, #255                       \n"  // a = 255
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
+    "bge        8b                             \n"
+
+  "89:                                         \n"
+    "adds       %3, #8-1                       \n"
+    "blt        99f                            \n"
+
+    // Blend 1 pixel.
+  "1:                                          \n"
+    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
+    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
+    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
+    "vmull.u8   q10, d4, d3                    \n"  // db * a
+    "vmull.u8   q11, d5, d3                    \n"  // dg * a
+    "vmull.u8   q12, d6, d3                    \n"  // dr * a
+    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+    "vqadd.u8   d2, d2, d6                     \n"  // + sr
+    "vmov.u8    d3, #255                       \n"  // a = 255
+    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
+    "bge        1b                             \n"
+
+  "99:                                         \n"
+
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+  );
+}
+
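+// Per color channel, the blend above works out to roughly (illustrative
+// scalar sketch, with s = src_argb0 pixel, d = src_argb1 pixel, s_a = the
+// alpha of s, all in 0..255):
+//   out_c = min(255, s_c + d_c - ((d_c * s_a + 128) >> 8));   // c in {b, g, r}
+//   out_a = 255;
+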
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // Attenuate 8 pixels.
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q10, d0, d3                    \n"  // b * a
+    "vmull.u8   q11, d1, d3                    \n"  // g * a
+    "vmull.u8   q12, d2, d3                    \n"  // r * a
+    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
+    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
+    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  :
+  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+  );
+}
+
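+// ARGBAttenuateRow_NEON above premultiplies each color channel by alpha,
+// roughly (illustrative scalar sketch):
+//   dst_c = (src_c * a + 128) >> 8;   // c in {b, g, r}; alpha is copied
+// Note the divisor is 256 rather than 255, matching the NEON >> 8.
+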
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  asm volatile (
+    "vdup.u16   q8, %2                         \n"
+    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
+    "vdup.u16   q9, %3                         \n"  // interval multiply.
+    "vdup.u16   q10, %4                        \n"  // interval add
+
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
+    "vmovl.u8   q1, d2                         \n"
+    "vmovl.u8   q2, d4                         \n"
+    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
+    "vqdmulh.s16 q1, q1, q8                    \n"  // g
+    "vqdmulh.s16 q2, q2, q8                    \n"  // r
+    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
+    "vmul.u16   q1, q1, q9                     \n"  // g
+    "vmul.u16   q2, q2, q9                     \n"  // r
+    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
+    "vadd.u16   q1, q1, q10                    \n"  // g
+    "vadd.u16   q2, q2, q10                    \n"  // r
+    "vqmovn.u16 d0, q0                         \n"
+    "vqmovn.u16 d2, q1                         \n"
+    "vqmovn.u16 d4, q2                         \n"
+    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),       // %0
+    "+r"(width)           // %1
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+  );
+}
+
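+// In scalar terms the quantize loop above does, per color channel and with
+// saturation to 255 (illustrative sketch; alpha is left untouched):
+//   c = ((c * scale) >> 16) * interval_size + interval_offset;
+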
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register in d0-d7.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
+    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
+    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
+
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
+    "vmovl.u8   q11, d22                       \n"
+    "vmovl.u8   q12, d24                       \n"
+    "vmovl.u8   q13, d26                       \n"
+    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
+    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
+    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
+    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
+    "vqmovn.u16 d20, q10                       \n"
+    "vqmovn.u16 d22, q11                       \n"
+    "vqmovn.u16 d24, q12                       \n"
+    "vqmovn.u16 d26, q13                       \n"
+    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),       // %0
+    "+r"(dst_argb),       // %1
+    "+r"(width)           // %2
+  : "r"(value)            // %3
+  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+  );
+}
+
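+// ARGBShadeRow_NEON above scales each channel of every pixel by the matching
+// channel of 'value' (a packed ARGB word).  Roughly, as a scalar sketch:
+//   dst_c = src_c * value_c / 255;   // value_c == 0xff leaves the channel unchanged
+// The vqrdmulh path computes approximately
+//   (src_c * value_c * 257 + 0x8000) >> 16.
+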
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d24                    \n"  // B
+    "vmlal.u8   q2, d1, d25                    \n"  // G
+    "vmlal.u8   q2, d2, d26                    \n"  // R
+    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
+    "vmov       d1, d0                         \n"  // G
+    "vmov       d2, d0                         \n"  // R
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+  );
+}
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d20, #17                       \n"  // BB coefficient
+    "vmov.u8    d21, #68                       \n"  // BG coefficient
+    "vmov.u8    d22, #35                       \n"  // BR coefficient
+    "vmov.u8    d24, #22                       \n"  // GB coefficient
+    "vmov.u8    d25, #88                       \n"  // GG coefficient
+    "vmov.u8    d26, #45                       \n"  // GR coefficient
+    "vmov.u8    d28, #24                       \n"  // BB coefficient
+    "vmov.u8    d29, #98                       \n"  // BG coefficient
+    "vmov.u8    d30, #50                       \n"  // BR coefficient
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
+    "vmlal.u8   q2, d1, d21                    \n"  // G
+    "vmlal.u8   q2, d2, d22                    \n"  // R
+    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
+    "vmlal.u8   q3, d1, d25                    \n"  // G
+    "vmlal.u8   q3, d2, d26                    \n"  // R
+    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
+    "vmlal.u8   q8, d1, d29                    \n"  // G
+    "vmlal.u8   q8, d2, d30                    \n"  // R
+    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
+    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
+    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
+    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),  // %0
+    "+r"(width)      // %1
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
+// needs to saturate.  Consider doing a non-saturating version.
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+                             const int8* matrix_argb, int width) {
+  asm volatile (
+    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
+    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
+    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
+
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
+    "vmovl.u8   q9, d18                        \n"  // g
+    "vmovl.u8   q10, d20                       \n"  // r
+    "vmovl.u8   q15, d22                       \n"  // a
+    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
+    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
+    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
+    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
+    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
+    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
+    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
+    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
+    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
+    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
+    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vmul.s16   q4, q15, d0[3]                 \n"  // B += A * Matrix B
+    "vmul.s16   q5, q15, d1[3]                 \n"  // G += A * Matrix G
+    "vmul.s16   q6, q15, d2[3]                 \n"  // R += A * Matrix R
+    "vmul.s16   q7, q15, d3[3]                 \n"  // A += A * Matrix A
+    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
+    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
+    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
+    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "r"(matrix_argb)  // %3
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+    "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
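+// Conceptually (per the comments above and the C reference), each output
+// channel of ARGBColorMatrixRow_NEON is a dot product of the input pixel
+// with one row of the 4x4 int8 matrix, scaled by 1/64 and saturated:
+//   out_b = clamp8((b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6);
+// out_g, out_r and out_a use m[4..7], m[8..11] and m[12..15] respectively.
+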
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q0, d0, d1                     \n"  // multiply B
+    "vmull.u8   q1, d2, d3                     \n"  // multiply G
+    "vmull.u8   q2, d4, d5                     \n"  // multiply R
+    "vmull.u8   q3, d6, d7                     \n"  // multiply A
+    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
+    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
+    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
+    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_NEON
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
+    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
+    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3"
+  );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d0, d0, d1                     \n"  // add
+    "vmov.u8    d1, d0                         \n"
+    "vmov.u8    d2, d0                         \n"
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (
+    // 16 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+    "vqadd.u8   q0, q0, q1                     \n"  // add
+    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
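
A scalar sketch of the Sobel combine step done by SobelRow_NEON and SobelToPlaneRow_NEON above, assuming the same saturating-add semantics as vqadd.u8; the helper name is hypothetical.

#include <stdint.h>

// sobel = min(sobelx + sobely, 255).  SobelRow_NEON replicates the value into
// B, G and R with A = 255; SobelToPlaneRow_NEON stores the single grey value.
static void SobelRow_C_sketch(const uint8_t* sobelx, const uint8_t* sobely,
                              uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = sobelx[i] + sobely[i];
    uint8_t v = (uint8_t)(s > 255 ? 255 : s);
    dst_argb[i * 4 + 0] = v;    // B
    dst_argb[i * 4 + 1] = v;    // G
    dst_argb[i * 4 + 2] = v;    // R
    dst_argb[i * 4 + 3] = 255;  // A
  }
}
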
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "vmov.u8    d3, #255                       \n"  // alpha
+    // 8 pixel loop.
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
+    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vqadd.u8   d1, d0, d2                     \n"  // add
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "cc", "memory", "q0", "q1"
+  );
+}
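
The channel packing in SobelXYRow_NEON, written out in scalar form so the B/G/R assignments in the comment above are easy to verify (illustrative helper only):

#include <stdint.h>

static void SobelXYRow_C_sketch(const uint8_t* sobelx, const uint8_t* sobely,
                                uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = sobelx[i] + sobely[i];                       // combined gradient
    dst_argb[i * 4 + 0] = sobely[i];                     // B = Sobel Y
    dst_argb[i * 4 + 1] = (uint8_t)(s > 255 ? 255 : s);  // G = Sobel
    dst_argb[i * 4 + 2] = sobelx[i];                     // R = Sobel X
    dst_argb[i * 4 + 3] = 255;                           // A
  }
}
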
+
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {d0}, [%0],%5                  \n"  // top
+    "vld1.8     {d1}, [%0],%6                  \n"
+    "vsubl.u8   q0, d0, d1                     \n"
+    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+    "vld1.8     {d3}, [%1],%6                  \n"
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+    "vld1.8     {d3}, [%2],%6                  \n"
+    "subs       %4, %4, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vabs.s16   q0, q0                         \n"
+    "vqmovn.u16 d0, q0                         \n"
+    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
+    "bgt        1b                             \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  : "r"(2),            // %5
+    "r"(6)             // %6
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
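
Reading the register increments (%5 = 2, %6 = 6), SobelXRow_NEON takes the difference of columns i and i+2 on the three input rows, weights the middle row twice, then applies an absolute value and saturation. A scalar sketch under those assumptions (hypothetical helper name):

#include <stdint.h>

static void SobelXRow_C_sketch(const uint8_t* y0, const uint8_t* y1,
                               const uint8_t* y2, uint8_t* dst_sobelx,
                               int width) {
  for (int i = 0; i < width; ++i) {
    int sobel = (y0[i] - y0[i + 2]) +
                2 * (y1[i] - y1[i + 2]) +
                (y2[i] - y2[i + 2]);
    // Sign is flipped relative to the -1 0 1 kernel comment above; the
    // absolute value (vabs.s16) makes the two conventions equivalent.
    if (sobel < 0) sobel = -sobel;
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);  // vqmovn.u16
  }
}
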
+
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {d0}, [%0],%4                  \n"  // left
+    "vld1.8     {d1}, [%1],%4                  \n"
+    "vsubl.u8   q0, d0, d1                     \n"
+    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+    "vld1.8     {d3}, [%1],%4                  \n"
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vld1.8     {d2}, [%0],%5                  \n"  // right
+    "vld1.8     {d3}, [%1],%5                  \n"
+    "subs       %3, %3, #8                     \n"  // 8 pixels
+    "vsubl.u8   q1, d2, d3                     \n"
+    "vadd.s16   q0, q0, q1                     \n"
+    "vabs.s16   q0, q0                         \n"
+    "vqmovn.u16 d0, q0                         \n"
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
+    "bgt        1b                             \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  : "r"(1),            // %4
+    "r"(6)             // %5
+  : "cc", "memory", "q0", "q1"  // Clobber List
+  );
+}
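
The same scalar reading of SobelYRow_NEON: only two source rows are passed because the kernel's middle row is all zeros, and the sign again disappears under the absolute value (hypothetical helper name):

#include <stdint.h>

static void SobelYRow_C_sketch(const uint8_t* y0, const uint8_t* y1,
                               uint8_t* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int sobel = (y0[i] - y1[i]) +
                2 * (y0[i + 1] - y1[i + 1]) +
                (y0[i + 2] - y1[i + 2]);
    if (sobel < 0) sobel = -sobel;
    dst_sobely[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}
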
+#endif  // __ARM_NEON__
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/row_posix.cc
@@ -1,0 +1,6443 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+// Constants for ARGB
+static vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPEG full range.
+static vec8 kARGBToYJ = {
+  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
+
+#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
+
+static vec8 kARGBToU = {
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static vec8 kARGBToUJ = {
+  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static vec8 kARGBToV = {
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static vec8 kARGBToVJ = {
+  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// Constants for BGRA
+static vec8 kBGRAToY = {
+  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static vec8 kBGRAToU = {
+  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static vec8 kBGRAToV = {
+  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR
+static vec8 kABGRToY = {
+  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static vec8 kABGRToU = {
+  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static vec8 kABGRToV = {
+  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA.
+static vec8 kRGBAToY = {
+  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static vec8 kRGBAToU = {
+  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static vec8 kRGBAToV = {
+  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+static vec16 kAddYJ64 = {
+  64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static uvec8 kAddUV128 = {
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static uvec16 kAddUVJ128 = {
+  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
+#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
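
These tables appear to be the usual fixed-point BT.601 coefficients laid out in B,G,R,A memory order: 0.098*128 ≈ 13, 0.504*128 ≈ 65 and 0.257*128 ≈ 33 reproduce kARGBToY, and the U/V tables are the corresponding /256 coefficients with the 128 bias supplied by kAddUV128. A scalar sketch of the Y path, matching the pmaddubsw / psrlw $7 / paddb sequence used by ARGBToYRow_SSSE3 further below (hypothetical helper, not libyuv API):

#include <stdint.h>

static uint8_t ARGBToY_C_sketch(uint8_t b, uint8_t g, uint8_t r) {
  // (13*B + 65*G + 33*R) >> 7, then the +16 studio-range offset (kAddY16).
  // The JPEG variant uses 15/75/38 with a +64 rounding term (kAddYJ64) and
  // no offset.
  return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}
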
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+
+// Shuffle table for converting RGB24 to ARGB.
+static uvec8 kShuffleMaskRGB24ToARGB = {
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+static uvec8 kShuffleMaskRAWToARGB = {
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+static uvec8 kShuffleMaskARGBToRGB24 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+static uvec8 kShuffleMaskARGBToRAW = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RGB24 for I422ToRGB24. First 8 pixels + next 4.
+static uvec8 kShuffleMaskARGBToRGB24_0 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGB to RAW for I422ToRAW. First 8 pixels + next 4.
+static uvec8 kShuffleMaskARGBToRAW_0 = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+};
+#endif  // HAS_RGB24TOARGBROW_SSSE3
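
Each table above is a 16-entry pshufb control: output byte i is taken from input byte mask[i], and a value with the high bit set (128) zeroes that byte. A minimal sketch of that semantics may make the tables easier to read (hypothetical helper, not part of this file):

#include <stdint.h>

// Emulates pshufb on one 16-byte lane:
// dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 15].
static void Pshufb_C_sketch(const uint8_t src[16], const uint8_t mask[16],
                            uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0f];
  }
}

With kShuffleMaskRGB24ToARGB, for example, source bytes {0,1,2}, {3,4,5}, {6,7,8} and {9,10,11} become the B,G,R of four ARGB pixels, and the fourth byte of each pixel is then forced to 0xff by the por with the 0xff000000 mask in RGB24ToARGBRow_SSSE3 below.
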
+
+#if defined(TESTING) && defined(__x86_64__)
+void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+  asm volatile (
+    ".p2align  5                               \n"
+    "mov       %%eax,%%eax                     \n"
+    "mov       %%ebx,%%ebx                     \n"
+    "mov       %%ecx,%%ecx                     \n"
+    "mov       %%edx,%%edx                     \n"
+    "mov       %%esi,%%esi                     \n"
+    "mov       %%edi,%%edi                     \n"
+    "mov       %%ebp,%%ebp                     \n"
+    "mov       %%esp,%%esp                     \n"
+    ".p2align  5                               \n"
+    "mov       %%r8d,%%r8d                     \n"
+    "mov       %%r9d,%%r9d                     \n"
+    "mov       %%r10d,%%r10d                   \n"
+    "mov       %%r11d,%%r11d                   \n"
+    "mov       %%r12d,%%r12d                   \n"
+    "mov       %%r13d,%%r13d                   \n"
+    "mov       %%r14d,%%r14d                   \n"
+    "mov       %%r15d,%%r15d                   \n"
+    ".p2align  5                               \n"
+    "lea       (%%rax),%%eax                   \n"
+    "lea       (%%rbx),%%ebx                   \n"
+    "lea       (%%rcx),%%ecx                   \n"
+    "lea       (%%rdx),%%edx                   \n"
+    "lea       (%%rsi),%%esi                   \n"
+    "lea       (%%rdi),%%edi                   \n"
+    "lea       (%%rbp),%%ebp                   \n"
+    "lea       (%%rsp),%%esp                   \n"
+    ".p2align  5                               \n"
+    "lea       (%%r8),%%r8d                    \n"
+    "lea       (%%r9),%%r9d                    \n"
+    "lea       (%%r10),%%r10d                  \n"
+    "lea       (%%r11),%%r11d                  \n"
+    "lea       (%%r12),%%r12d                  \n"
+    "lea       (%%r13),%%r13d                  \n"
+    "lea       (%%r14),%%r14d                  \n"
+    "lea       (%%r15),%%r15d                  \n"
+
+    ".p2align  5                               \n"
+    "lea       0x10(%%rax),%%eax               \n"
+    "lea       0x10(%%rbx),%%ebx               \n"
+    "lea       0x10(%%rcx),%%ecx               \n"
+    "lea       0x10(%%rdx),%%edx               \n"
+    "lea       0x10(%%rsi),%%esi               \n"
+    "lea       0x10(%%rdi),%%edi               \n"
+    "lea       0x10(%%rbp),%%ebp               \n"
+    "lea       0x10(%%rsp),%%esp               \n"
+    ".p2align  5                               \n"
+    "lea       0x10(%%r8),%%r8d                \n"
+    "lea       0x10(%%r9),%%r9d                \n"
+    "lea       0x10(%%r10),%%r10d              \n"
+    "lea       0x10(%%r11),%%r11d              \n"
+    "lea       0x10(%%r12),%%r12d              \n"
+    "lea       0x10(%%r13),%%r13d              \n"
+    "lea       0x10(%%r14),%%r14d              \n"
+    "lea       0x10(%%r15),%%r15d              \n"
+
+    ".p2align  5                               \n"
+    "add       0x10,%%eax                      \n"
+    "add       0x10,%%ebx                      \n"
+    "add       0x10,%%ecx                      \n"
+    "add       0x10,%%edx                      \n"
+    "add       0x10,%%esi                      \n"
+    "add       0x10,%%edi                      \n"
+    "add       0x10,%%ebp                      \n"
+    "add       0x10,%%esp                      \n"
+    ".p2align  5                               \n"
+    "add       0x10,%%r8d                      \n"
+    "add       0x10,%%r9d                      \n"
+    "add       0x10,%%r10d                     \n"
+    "add       0x10,%%r11d                     \n"
+    "add       0x10,%%r12d                     \n"
+    "add       0x10,%%r13d                     \n"
+    "add       0x10,%%r14d                     \n"
+    "add       0x10,%%r15d                     \n"
+
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_y),     // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+#endif  // TESTING
+
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0x18,%%xmm5                    \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm0                   \n"
+    "punpckhwd %%xmm1,%%xmm1                   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_y),     // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
+                                  int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0x18,%%xmm5                    \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm0                   \n"
+    "punpckhwd %%xmm1,%%xmm1                   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_y),     // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_I400TOARGBROW_SSE2
+
+#ifdef HAS_RGB24TOARGBROW_SSSE3
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
+    "pslld     $0x18,%%xmm5                    \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x30,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm2                   \n"
+    "palignr   $0x8,%%xmm1,%%xmm2              \n"
+    "pshufb    %%xmm4,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm2                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "palignr   $0x4,%%xmm3,%%xmm3              \n"
+    "pshufb    %%xmm4,%%xmm3                   \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "m"(kShuffleMaskRGB24ToARGB)  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
+    "pslld     $0x18,%%xmm5                    \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x30,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm2                   \n"
+    "palignr   $0x8,%%xmm1,%%xmm2              \n"
+    "pshufb    %%xmm4,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm2                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "palignr   $0x4,%%xmm3,%%xmm3              \n"
+    "pshufb    %%xmm4,%%xmm3                   \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "m"(kShuffleMaskRAWToARGB)  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+  asm volatile (
+    "mov       $0x1080108,%%eax                \n"
+    "movd      %%eax,%%xmm5                    \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    "mov       $0x20802080,%%eax               \n"
+    "movd      %%eax,%%xmm6                    \n"
+    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psllw     $0xb,%%xmm3                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psllw     $0xa,%%xmm4                     \n"
+    "psrlw     $0x5,%%xmm4                     \n"
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psllw     $0x8,%%xmm7                     \n"
+    "sub       %0,%1                           \n"
+    "sub       %0,%1                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm3,%%xmm1                   \n"
+    "psllw     $0xb,%%xmm2                     \n"
+    "pmulhuw   %%xmm5,%%xmm1                   \n"
+    "pmulhuw   %%xmm5,%%xmm2                   \n"
+    "psllw     $0x8,%%xmm1                     \n"
+    "por       %%xmm2,%%xmm1                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"
+    "pmulhuw   %%xmm6,%%xmm0                   \n"
+    "por       %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm1                   \n"
+    "punpckhbw %%xmm0,%%xmm2                   \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
+    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :
+  : "memory", "cc", "eax"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+  asm volatile (
+    "mov       $0x1080108,%%eax                \n"
+    "movd      %%eax,%%xmm5                    \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    "mov       $0x42004200,%%eax               \n"
+    "movd      %%eax,%%xmm6                    \n"
+    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psllw     $0xb,%%xmm3                     \n"
+    "movdqa    %%xmm3,%%xmm4                   \n"
+    "psrlw     $0x6,%%xmm4                     \n"
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psllw     $0x8,%%xmm7                     \n"
+    "sub       %0,%1                           \n"
+    "sub       %0,%1                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psllw     $0x1,%%xmm1                     \n"
+    "psllw     $0xb,%%xmm2                     \n"
+    "pand      %%xmm3,%%xmm1                   \n"
+    "pmulhuw   %%xmm5,%%xmm2                   \n"
+    "pmulhuw   %%xmm5,%%xmm1                   \n"
+    "psllw     $0x8,%%xmm1                     \n"
+    "por       %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"
+    "psraw     $0x8,%%xmm2                     \n"
+    "pmulhuw   %%xmm6,%%xmm0                   \n"
+    "pand      %%xmm7,%%xmm2                   \n"
+    "por       %%xmm2,%%xmm0                   \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm1                   \n"
+    "punpckhbw %%xmm0,%%xmm2                   \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
+    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :
+  : "memory", "cc", "eax"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
+  asm volatile (
+    "mov       $0xf0f0f0f,%%eax                \n"
+    "movd      %%eax,%%xmm4                    \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "pslld     $0x4,%%xmm5                     \n"
+    "sub       %0,%1                           \n"
+    "sub       %0,%1                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "psllw     $0x4,%%xmm1                     \n"
+    "psrlw     $0x4,%%xmm3                     \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "por       %%xmm3,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm2,%%xmm0                   \n"
+    "punpckhbw %%xmm2,%%xmm1                   \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm0,0x00,1,0,2)           //  movdqa  %%xmm0,(%1,%0,2)
+    MEMOPMEM(movdqa,xmm1,0x10,1,0,2)           //  movdqa  %%xmm1,0x10(%1,%0,2)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :
+  : "memory", "cc", "eax"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
+  asm volatile (
+    "movdqa    %3,%%xmm6                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "pshufb    %%xmm6,%%xmm0                   \n"
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm6,%%xmm2                   \n"
+    "pshufb    %%xmm6,%%xmm3                   \n"
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "psrldq    $0x4,%%xmm1                     \n"
+    "pslldq    $0xc,%%xmm4                     \n"
+    "movdqa    %%xmm2,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pslldq    $0x8,%%xmm5                     \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "psrldq    $0x8,%%xmm2                     \n"
+    "pslldq    $0x4,%%xmm3                     \n"
+    "por       %%xmm3,%%xmm2                   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x30,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  : "m"(kShuffleMaskARGBToRGB24)  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+
+void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
+  asm volatile (
+    "movdqa    %3,%%xmm6                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "pshufb    %%xmm6,%%xmm0                   \n"
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm6,%%xmm2                   \n"
+    "pshufb    %%xmm6,%%xmm3                   \n"
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "psrldq    $0x4,%%xmm1                     \n"
+    "pslldq    $0xc,%%xmm4                     \n"
+    "movdqa    %%xmm2,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pslldq    $0x8,%%xmm5                     \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "psrldq    $0x8,%%xmm2                     \n"
+    "pslldq    $0x4,%%xmm3                     \n"
+    "por       %%xmm3,%%xmm2                   \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x30,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  : "m"(kShuffleMaskARGBToRAW)  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+
+void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "psrld     $0x1b,%%xmm3                    \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrld     $0x1a,%%xmm4                    \n"
+    "pslld     $0x5,%%xmm4                     \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0xb,%%xmm5                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "pslld     $0x8,%%xmm0                     \n"
+    "psrld     $0x3,%%xmm1                     \n"
+    "psrld     $0x5,%%xmm2                     \n"
+    "psrad     $0x10,%%xmm0                    \n"
+    "pand      %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm4,%%xmm2                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "por       %%xmm2,%%xmm1                   \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psrld     $0x1b,%%xmm4                    \n"
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "pslld     $0x5,%%xmm5                     \n"
+    "movdqa    %%xmm4,%%xmm6                   \n"
+    "pslld     $0xa,%%xmm6                     \n"
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "pslld     $0xf,%%xmm7                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "psrad     $0x10,%%xmm0                    \n"
+    "psrld     $0x3,%%xmm1                     \n"
+    "psrld     $0x6,%%xmm2                     \n"
+    "psrld     $0x9,%%xmm3                     \n"
+    "pand      %%xmm7,%%xmm0                   \n"
+    "pand      %%xmm4,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm2                   \n"
+    "pand      %%xmm6,%%xmm3                   \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "por       %%xmm3,%%xmm2                   \n"
+    "por       %%xmm2,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMACCESS2(0x8,1) ",%1        \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psllw     $0xc,%%xmm4                     \n"
+    "movdqa    %%xmm4,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm3                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm3,%%xmm0                   \n"
+    "pand      %%xmm4,%%xmm1                   \n"
+    "psrlq     $0x4,%%xmm0                     \n"
+    "psrlq     $0x8,%%xmm1                     \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x4,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(pix)   // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+  );
+}
+#endif  // HAS_RGB24TOARGBROW_SSSE3
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBTOYROW_SSSE3
+
+#ifdef HAS_ARGBTOYJROW_SSSE3
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "paddw     %%xmm5,%%xmm0                   \n"
+    "paddw     %%xmm5,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToYJ),  // %3
+    "m"(kAddYJ64)    // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "paddw     %%xmm5,%%xmm0                   \n"
+    "paddw     %%xmm5,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToYJ),  // %3
+    "m"(kAddYJ64)    // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBTOYJROW_SSSE3
+
+#ifdef HAS_ARGBTOUVROW_SSSE3
+// TODO(fbarchard): pass xmm constants to single block of assembly.
+// fpic on GCC 4.2 for OSX runs out of GPR registers: "m" effectively takes
+// 3 registers (ebx, ebp and eax), and "m" can be passed with 3 normal
+// registers, or 4 if the stack frame is disabled. Doing 2 assembly blocks is
+// a workaround, though it is considered unsafe.
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kARGBToU),  // %0
+    "m"(kARGBToV),  // %1
+    "m"(kAddUV128)  // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN
+    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
+    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
+    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
+    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)) // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
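
As a rough scalar model of what ARGBToUVRow_SSSE3 produces: the pavgb/shufps sequence box-filters each 2x2 block of ARGB pixels (with slightly different rounding than a plain average), then applies the kARGBToU/kARGBToV coefficients with an arithmetic shift and the 128 bias. The sketch below assumes an even width and omits the signed saturation that packsswb performs; the helper name is hypothetical.

#include <stdint.h>

static void ARGBToUVRow_C_sketch(const uint8_t* argb, int stride,
                                 uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int i = 0; i < width; i += 2) {
    const uint8_t* p0 = argb + i * 4;   // top-left pixel of the 2x2 block
    const uint8_t* p1 = p0 + stride;    // pixel directly below
    int b = (p0[0] + p0[4] + p1[0] + p1[4] + 2) >> 2;
    int g = (p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2;
    int r = (p0[2] + p0[6] + p1[2] + p1[6] + 2) >> 2;
    dst_u[i / 2] = (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    dst_v[i / 2] = (uint8_t)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
  }
}
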
+
+// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kARGBToUJ),  // %0
+    "m"(kARGBToVJ),  // %1
+    "m"(kAddUVJ128)  // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN
+    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
+    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
+    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
+    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "paddw     %%xmm5,%%xmm0                   \n"
+    "paddw     %%xmm5,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)) // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kARGBToU),         // %0
+    "m"(kARGBToV),         // %1
+    "m"(kAddUV128)         // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb)) // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                  uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kARGBToUJ),         // %0
+    "m"(kARGBToVJ),         // %1
+    "m"(kAddUVJ128)         // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "paddw     %%xmm5,%%xmm0                   \n"
+    "paddw     %%xmm5,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_argb))
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                          int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kARGBToU),  // %0
+    "m"(kARGBToV),  // %1
+    "m"(kAddUV128)  // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm6                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm2                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm2                     \n"
+    "packsswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm3,%%xmm0                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm2                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm2                     \n"
+    "packsswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm0,0x00,1,2,1)           //  movdqa  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),        // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6"
+#endif
+  );
+}
+
+void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
+                                    uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kARGBToU),  // %0
+    "m"(kARGBToV),  // %1
+    "m"(kAddUV128)  // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm6                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm2                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm2                     \n"
+    "packsswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm3,%%xmm0                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm2                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm2                     \n"
+    "packsswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),        // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6"
+#endif
+  );
+}
+
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kARGBToU),  // %0
+    "m"(kARGBToV),  // %1
+    "m"(kAddUV128)  // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
+                                    uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kARGBToU),  // %0
+    "m"(kARGBToV),  // %1
+    "m"(kAddUV128)  // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kBGRAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kBGRAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kBGRAToU),         // %0
+    "m"(kBGRAToV),         // %1
+    "m"(kAddUV128)         // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN
+    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
+    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
+    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
+    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_bgra0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_bgra)) // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kBGRAToU),         // %0
+    "m"(kBGRAToV),         // %1
+    "m"(kAddUV128)         // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_bgra0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_bgra)) // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kABGRToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kABGRToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kRGBAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
+  asm volatile (
+    "movdqa    %4,%%xmm5                       \n"
+    "movdqa    %3,%%xmm4                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm4,%%xmm3                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "phaddw    %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm2                     \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kRGBAToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kABGRToU),         // %0
+    "m"(kABGRToV),         // %1
+    "m"(kAddUV128)         // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN
+    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
+    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
+    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
+    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_abgr)) // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kABGRToU),         // %0
+    "m"(kABGRToV),         // %1
+    "m"(kAddUV128)         // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_abgr0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_abgr)) // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kRGBAToU),         // %0
+    "m"(kRGBAToV),         // %1
+    "m"(kAddUV128)         // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN
+    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
+    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
+    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
+    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_rgba))
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kRGBAToU),         // %0
+    "m"(kRGBAToV),         // %1
+    "m"(kAddUV128)         // %2
+  );
+  asm volatile (
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm7                   \n"
+    "shufps    $0x88,%%xmm6,%%xmm2             \n"
+    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+    "pavgb     %%xmm7,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm2                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "phaddw    %%xmm2,%%xmm0                   \n"
+    "phaddw    %%xmm6,%%xmm1                   \n"
+    "psraw     $0x8,%%xmm0                     \n"
+    "psraw     $0x8,%%xmm1                     \n"
+    "packsswb  %%xmm1,%%xmm0                   \n"
+    "paddb     %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movlps    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_rgba0),       // %0
+    "+r"(dst_u),           // %1
+    "+r"(dst_v),           // %2
+    "+rm"(width)           // %3
+  : "r"((intptr_t)(src_stride_rgba)) // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_ARGBTOUVROW_SSSE3
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+#define UB 127 /* 2.018 * 64 = 129 overflows int8; saturated to 127 */
+#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
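+
+// The bias terms fold the unsigned-to-signed recentering of U and V into a
+// single psubw: pmaddubsw multiplies the unsigned U,V bytes by the signed
+// coefficients, so for each channel
+//   C_U*U + C_V*V - (C_U*128 + C_V*128) = C_U*(U - 128) + C_V*(V - 128).
+// Worked out: BB = 127*128 + 0*128 = 16256, BG = -25*128 + -52*128 = -9856,
+// BR = 0*128 + 102*128 = 13056.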
+
+struct {
+  vec8 kUVToB;  // 0
+  vec8 kUVToG;  // 16
+  vec8 kUVToR;  // 32
+  vec16 kUVBiasB;  // 48
+  vec16 kUVBiasG;  // 64
+  vec16 kUVBiasR;  // 80
+  vec16 kYSub16;  // 96
+  vec16 kYToRgb;  // 112
+  vec8 kVUToB;  // 128
+  vec8 kVUToG;  // 144
+  vec8 kVUToR;  // 160
+} static SIMD_ALIGNED(kYuvConstants) = {
+  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
+  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
+  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
+  { BB, BB, BB, BB, BB, BB, BB, BB },
+  { BG, BG, BG, BG, BG, BG, BG, BG },
+  { BR, BR, BR, BR, BR, BR, BR, BR },
+  { 16, 16, 16, 16, 16, 16, 16, 16 },
+  { YG, YG, YG, YG, YG, YG, YG, YG },
+  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
+  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
+  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
+};
+
+
+// Read 8 UV from 444
+#define READYUV444                                                             \
+    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    BUNDLEALIGN                                                                \
+    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"
+
+// Read 4 UV from 422, upsample to 8 UV
+#define READYUV422                                                             \
+    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    BUNDLEALIGN                                                                \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"
+
+// Read 2 UV from 411, upsample to 8 UV
+#define READYUV411                                                             \
+    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
+    BUNDLEALIGN                                                                \
+    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
+    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
+    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
+    "punpckldq  %%xmm0,%%xmm0                                   \n"
+
+// Read 4 UV from NV12, upsample to 8 UV
+#define READNV12                                                               \
+    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
+    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
+    "punpcklwd  %%xmm0,%%xmm0                                   \n"
+
+// Convert 8 pixels: 8 UV and 8 Y
+#define YUVTORGB                                                               \
+    "movdqa     %%xmm0,%%xmm1                                   \n"            \
+    "movdqa     %%xmm0,%%xmm2                                   \n"            \
+    "pmaddubsw  " MEMACCESS([kYuvConstants]) ",%%xmm0           \n"            \
+    "pmaddubsw  " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1      \n"            \
+    "pmaddubsw  " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2      \n"            \
+    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
+    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
+    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
+    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
+    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
+    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
+    "paddsw     %%xmm3,%%xmm0                                   \n"            \
+    "paddsw     %%xmm3,%%xmm1                                   \n"            \
+    "paddsw     %%xmm3,%%xmm2                                   \n"            \
+    "psraw      $0x6,%%xmm0                                     \n"            \
+    "psraw      $0x6,%%xmm1                                     \n"            \
+    "psraw      $0x6,%%xmm2                                     \n"            \
+    "packuswb   %%xmm0,%%xmm0                                   \n"            \
+    "packuswb   %%xmm1,%%xmm1                                   \n"            \
+    "packuswb   %%xmm2,%%xmm2                                   \n"
+
+// Convert 8 pixels: 8 VU and 8 Y
+#define YVUTORGB                                                               \
+    "movdqa     %%xmm0,%%xmm1                                   \n"            \
+    "movdqa     %%xmm0,%%xmm2                                   \n"            \
+    "pmaddubsw  " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0     \n"            \
+    "pmaddubsw  " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1     \n"            \
+    "pmaddubsw  " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2     \n"            \
+    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
+    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
+    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
+    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
+    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
+    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
+    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
+    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
+    "paddsw     %%xmm3,%%xmm0                                   \n"            \
+    "paddsw     %%xmm3,%%xmm1                                   \n"            \
+    "paddsw     %%xmm3,%%xmm2                                   \n"            \
+    "psraw      $0x6,%%xmm0                                     \n"            \
+    "psraw      $0x6,%%xmm1                                     \n"            \
+    "psraw      $0x6,%%xmm2                                     \n"            \
+    "packuswb   %%xmm0,%%xmm0                                   \n"            \
+    "packuswb   %%xmm1,%%xmm1                                   \n"            \
+    "packuswb   %%xmm2,%%xmm2                                   \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV444
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "   \n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb]  \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
+                                 const uint8* u_buf,
+                                 const uint8* v_buf,
+                                 uint8* dst_rgb24,
+                                 int width) {
+// With -fPIC, 32-bit gcc 4.2 on OS X runs out of general-purpose registers,
+// so the shuffle masks are loaded in a separate asm block.
+#if defined(__i386__)
+  asm volatile (
+    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
+  :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
+#endif
+
+  asm volatile (
+#if !defined(__i386__)
+    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
+    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
+#endif
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm2,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"
+    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
+    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
+#if !defined(__i386__)
+    , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
+    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
+#endif
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+
+void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_raw,
+                               int width) {
+// With -fPIC, 32-bit gcc 4.2 on OS X runs out of general-purpose registers,
+// so the shuffle masks are loaded in a separate asm block.
+#if defined(__i386__)
+  asm volatile (
+    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
+    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
+  :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
+#endif
+
+  asm volatile (
+#if !defined(__i386__)
+    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
+    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
+#endif
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm2,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm6,%%xmm1                   \n"
+    "palignr   $0xc,%%xmm0,%%xmm1              \n"
+    "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
+    "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_raw]"+r"(dst_raw),  // %[dst_raw]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
+#if !defined(__i386__)
+    , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
+    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
+#endif
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+
+void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV411
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* uv_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READNV12
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+  // Does not use r14.
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                                const uint8* uv_buf,
+                                uint8* dst_argb,
+                                int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READNV12
+    YVUTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+  // Does not use r14.
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* dst_argb,
+                                          int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV444
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* dst_argb,
+                                          int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* dst_argb,
+                                          int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV411
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
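+// NV12 and NV21 share the READNV12 loader for interleaved chroma; NV21 gets
+// the swapped U/V order by using YVUTORGB instead of YUVTORGB.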
+void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* uv_buf,
+                                          uint8* dst_argb,
+                                          int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READNV12
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+  // Does not use r14.
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* uv_buf,
+                                          uint8* dst_argb,
+                                          int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READNV12
+    YVUTORGB
+    "punpcklbw %%xmm1,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm0                   \n"
+    "punpckhwd %%xmm2,%%xmm1                   \n"
+    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
+    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+  // Does not use r14.
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
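+// The BGRA, ABGR and RGBA writers below differ from the ARGB versions only in
+// how the converted B, G, R and alpha bytes are interleaved before the store.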
+void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_bgra,
+                                int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "punpcklbw %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm2,%%xmm5                   \n"
+    "movdqa    %%xmm5,%%xmm0                   \n"
+    "punpcklwd %%xmm1,%%xmm5                   \n"
+    "punpckhwd %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm5," MEMACCESS([dst_bgra]) "\n"
+    "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
+    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_abgr,
+                                int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm2                   \n"
+    "punpckhwd %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2," MEMACCESS([dst_abgr]) "\n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
+    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
+                                const uint8* u_buf,
+                                const uint8* v_buf,
+                                uint8* dst_rgba,
+                                int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "punpcklbw %%xmm2,%%xmm1                   \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"
+    "movdqa    %%xmm5,%%xmm0                   \n"
+    "punpcklwd %%xmm1,%%xmm5                   \n"
+    "punpckhwd %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm5," MEMACCESS([dst_rgba]) "\n"
+    "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
+    "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* dst_bgra,
+                                          int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "punpcklbw %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm2,%%xmm5                   \n"
+    "movdqa    %%xmm5,%%xmm0                   \n"
+    "punpcklwd %%xmm1,%%xmm5                   \n"
+    "punpckhwd %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "\n"
+    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
+    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* dst_abgr,
+                                          int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm2                   \n"
+    "punpckhwd %%xmm0,%%xmm1                   \n"
+    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "\n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
+    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* dst_rgba,
+                                          int width) {
+  asm volatile (
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+    LABELALIGN
+  "1:                                          \n"
+    READYUV422
+    YUVTORGB
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "punpcklbw %%xmm2,%%xmm1                   \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"
+    "movdqa    %%xmm5,%%xmm0                   \n"
+    "punpcklwd %%xmm1,%%xmm5                   \n"
+    "punpckhwd %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "\n"
+    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
+    "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+#endif  // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_YTOARGBROW_SSE2
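+// Convert 8 Y values per loop to gray ARGB:
+// B = G = R = clamp((y - 16) * 74 >> 6), roughly 1.164 * (y - 16),
+// with alpha forced to 0xff.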
+void YToARGBRow_SSE2(const uint8* y_buf,
+                     uint8* dst_argb,
+                     int width) {
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                   \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
+    "mov       $0x00100010,%%eax               \n"
+    "movd      %%eax,%%xmm3                    \n"
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+    "mov       $0x004a004a,%%eax               \n"
+    "movd      %%eax,%%xmm2                    \n"
+    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+    LABELALIGN
+  "1:                                          \n"
+    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "psubusw   %%xmm3,%%xmm0                   \n"
+    "pmullw    %%xmm2,%%xmm0                   \n"
+    "psrlw     $6, %%xmm0                      \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+
+    // Step 2: Weave into ARGB
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm0,%%xmm0                   \n"
+    "punpckhwd %%xmm1,%%xmm1                   \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "por       %%xmm4,%%xmm1                   \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(y_buf),     // %0
+    "+r"(dst_argb),  // %1
+    "+rm"(width)     // %2
+  :
+  : "memory", "cc", "eax"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+  );
+}
+#endif  // HAS_YTOARGBROW_SSE2
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static uvec8 kShuffleMirror = {
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
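+// Reverse a row of bytes, 16 per loop: read 16 bytes from the tail of the
+// source (src + width - 16) and reorder them with the byte-reversal shuffle.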
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile (
+    "movdqa    %3,%%xmm5                       \n"
+    "lea       " MEMLEA(-0x10,0) ",%0          \n"
+    LABELALIGN
+  "1:                                          \n"
+    MEMOPREG(movdqa,0x00,0,2,1,xmm0)           //  movdqa  (%0,%2),%%xmm0
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kShuffleMirror) // %3
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_SSE2
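+// SSE2 fallback without pshufb: swap the two bytes of each word with
+// shifts/por, then reverse word and qword order with pshuflw/pshufhw/pshufd.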
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile (
+    "lea       " MEMLEA(-0x10,0) ",%0          \n"
+    LABELALIGN
+  "1:                                          \n"
+    MEMOPREG(movdqu,0x00,0,2,1,xmm0)           //  movdqu  (%0,%2),%%xmm0
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "psllw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm1,%%xmm0                   \n"
+    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
+    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
+    "sub       $0x10,%2                        \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1)",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif  // HAS_MIRRORROW_SSE2
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for reversing the bytes of UV channels.
+static uvec8 kShuffleMirrorUV = {
+  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
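+// Mirror a row of interleaved UV and split it: 8 UV pairs per loop are read
+// from the tail of the row; the shuffle places reversed U bytes in the low
+// 8 bytes and reversed V bytes in the high 8, stored to dst_u and dst_v.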
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+                       int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile (
+    "movdqa    %4,%%xmm1                       \n"
+    "lea       " MEMLEA4(-0x10,0,3,2) ",%0       \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(-0x10,0) ",%0            \n"
+    "pshufb    %%xmm1,%%xmm0                   \n"
+    "sub       $8,%3                           \n"
+    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src),      // %0
+    "+r"(dst_u),    // %1
+    "+r"(dst_v),    // %2
+    "+r"(temp_width)  // %3
+  : "m"(kShuffleMirrorUV)  // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif  // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSSE3
+// Shuffle table for reversing the order of the ARGB pixels (4 bytes each).
+static uvec8 kARGBShuffleMirror = {
+  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
+};
+
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile (
+    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
+    "movdqa    %3,%%xmm5                       \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "lea       " MEMLEA(-0x10,0) ",%0          \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src),  // %0
+    "+r"(dst),  // %1
+    "+r"(temp_width)  // %2
+  : "m"(kARGBShuffleMirror)  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBMIRRORROW_SSSE3
+
+#ifdef HAS_SPLITUVROW_SSE2
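+// De-interleave 16 UV pairs per loop: mask with 0x00ff to collect the U bytes
+// and shift right by 8 for the V bytes before packing.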
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "pcmpeqb    %%xmm5,%%xmm5                    \n"
+    "psrlw      $0x8,%%xmm5                      \n"
+    "sub        %1,%2                            \n"
+    LABELALIGN
+  "1:                                            \n"
+    "movdqa     " MEMACCESS(0) ",%%xmm0          \n"
+    "movdqa     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
+    "lea        " MEMLEA(0x20,0) ",%0            \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "pand       %%xmm5,%%xmm0                    \n"
+    "pand       %%xmm5,%%xmm1                    \n"
+    "packuswb   %%xmm1,%%xmm0                    \n"
+    "psrlw      $0x8,%%xmm2                      \n"
+    "psrlw      $0x8,%%xmm3                      \n"
+    "packuswb   %%xmm3,%%xmm2                    \n"
+    "movdqa     %%xmm0," MEMACCESS(1) "          \n"
+    MEMOPMEM(movdqa,xmm2,0x00,1,2,1)             // movdqa     %%xmm2,(%1,%2)
+    "lea        " MEMLEA(0x10,1) ",%1            \n"
+    "sub        $0x10,%3                         \n"
+    "jg         1b                               \n"
+  : "+r"(src_uv),     // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+r"(pix)         // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+
+void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                               int pix) {
+  asm volatile (
+    "pcmpeqb    %%xmm5,%%xmm5                    \n"
+    "psrlw      $0x8,%%xmm5                      \n"
+    "sub        %1,%2                            \n"
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
+    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
+    "lea        " MEMLEA(0x20,0) ",%0            \n"
+    "movdqa     %%xmm0,%%xmm2                    \n"
+    "movdqa     %%xmm1,%%xmm3                    \n"
+    "pand       %%xmm5,%%xmm0                    \n"
+    "pand       %%xmm5,%%xmm1                    \n"
+    "packuswb   %%xmm1,%%xmm0                    \n"
+    "psrlw      $0x8,%%xmm2                      \n"
+    "psrlw      $0x8,%%xmm3                      \n"
+    "packuswb   %%xmm3,%%xmm2                    \n"
+    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
+    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
+    "lea        " MEMLEA(0x10,1) ",%1            \n"
+    "sub        $0x10,%3                         \n"
+    "jg         1b                               \n"
+  : "+r"(src_uv),     // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+r"(pix)         // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_SPLITUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_SSE2
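+// Interleave 16 U and 16 V bytes per loop into 32 bytes of UV pairs using
+// punpcklbw/punpckhbw.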
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  asm volatile (
+    "sub       %0,%1                             \n"
+    LABELALIGN
+  "1:                                            \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0           \n"
+    MEMOPREG(movdqa,0x00,0,1,1,xmm1)             //  movdqa    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0             \n"
+    "movdqa    %%xmm0,%%xmm2                     \n"
+    "punpcklbw %%xmm1,%%xmm0                     \n"
+    "punpckhbw %%xmm1,%%xmm2                     \n"
+    "movdqa    %%xmm0," MEMACCESS(2) "           \n"
+    "movdqa    %%xmm2," MEMACCESS2(0x10,2) "     \n"
+    "lea       " MEMLEA(0x20,2) ",%2             \n"
+    "sub       $0x10,%3                          \n"
+    "jg        1b                                \n"
+  : "+r"(src_u),     // %0
+    "+r"(src_v),     // %1
+    "+r"(dst_uv),    // %2
+    "+r"(width)      // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
+  );
+}
+
+void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+                               uint8* dst_uv, int width) {
+  asm volatile (
+    "sub       %0,%1                             \n"
+    LABELALIGN
+  "1:                                            \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
+    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0             \n"
+    "movdqa    %%xmm0,%%xmm2                     \n"
+    "punpcklbw %%xmm1,%%xmm0                     \n"
+    "punpckhbw %%xmm1,%%xmm2                     \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
+    "lea       " MEMLEA(0x20,2) ",%2             \n"
+    "sub       $0x10,%3                          \n"
+    "jg        1b                                \n"
+  : "+r"(src_u),     // %0
+    "+r"(src_v),     // %1
+    "+r"(dst_uv),    // %2
+    "+r"(width)      // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
+  );
+}
+#endif  // HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_COPYROW_SSE2
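+// Copy 32 bytes per loop with aligned loads/stores; assumes 16-byte aligned
+// pointers and a byte count that is a multiple of 32.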
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x20,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(count)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif  // HAS_COPYROW_SSE2
+
+#ifdef HAS_COPYROW_X86
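+// Copy with rep movsl; the low two bits of width are dropped by the shr,
+// so width is expected to be a multiple of 4 bytes.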
+void CopyRow_X86(const uint8* src, uint8* dst, int width) {
+  size_t width_tmp = (size_t)(width);
+  asm volatile (
+    "shr       $0x2,%2                         \n"
+    "rep movsl " MEMMOVESTRING(0,1) "          \n"
+  : "+S"(src),  // %0
+    "+D"(dst),  // %1
+    "+c"(width_tmp) // %2
+  :
+  : "memory", "cc"
+  );
+}
+#endif  // HAS_COPYROW_X86
+
+#ifdef HAS_COPYROW_ERMS
+// Unaligned; handles any byte count (multiple of 1) via rep movsb.
+void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+  size_t width_tmp = (size_t)(width);
+  asm volatile (
+    "rep movsb " MEMMOVESTRING(0,1) "          \n"
+  : "+S"(src),  // %0
+    "+D"(dst),  // %1
+    "+c"(width_tmp) // %2
+  :
+  : "memory", "cc"
+  );
+}
+#endif  // HAS_COPYROW_ERMS
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
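+// Copy the alpha channel of 8 src ARGB pixels per loop into dst, leaving the
+// dst RGB bytes untouched: dst = (src & 0xff000000) | (dst & 0x00ffffff).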
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm0,%%xmm0                   \n"
+    "pslld     $0x18,%%xmm0                    \n"
+    "pcmpeqb   %%xmm1,%%xmm1                   \n"
+    "psrld     $0x8,%%xmm1                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
+    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
+    "pand      %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm0,%%xmm3                   \n"
+    "pand      %%xmm1,%%xmm4                   \n"
+    "pand      %%xmm1,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
+    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
+    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
+  );
+}
+#endif  // HAS_ARGBCOPYALPHAROW_AVX2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
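+// Expand 8 Y bytes per loop into the alpha channel of 8 dst ARGB pixels,
+// keeping the dst RGB bytes.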
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm0,%%xmm0                   \n"
+    "pslld     $0x18,%%xmm0                    \n"
+    "pcmpeqb   %%xmm1,%%xmm1                   \n"
+    "psrld     $0x8,%%xmm1                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm2,%%xmm2                   \n"
+    "punpckhwd %%xmm2,%%xmm3                   \n"
+    "punpcklwd %%xmm2,%%xmm2                   \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
+    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
+    "pand      %%xmm0,%%xmm2                   \n"
+    "pand      %%xmm0,%%xmm3                   \n"
+    "pand      %%xmm1,%%xmm4                   \n"
+    "pand      %%xmm1,%%xmm5                   \n"
+    "por       %%xmm4,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  asm volatile (
+    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
+    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
+    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
+    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
+    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
+    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
+    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
+  );
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
+
+#ifdef HAS_SETROW_X86
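+// Set 'width' bytes to the 32-bit pattern v32 with rep stosl; width is
+// expected to be a multiple of 4. ARGBSetRows_X86 below stores 'width' ARGB
+// pixels (dwords) per row for 'height' rows.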
+void SetRow_X86(uint8* dst, uint32 v32, int width) {
+  size_t width_tmp = (size_t)(width);
+  asm volatile (
+    "shr       $0x2,%1                         \n"
+    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
+    : "+D"(dst),       // %0
+      "+c"(width_tmp)  // %1
+    : "a"(v32)         // %2
+    : "memory", "cc");
+}
+
+void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
+                     int dst_stride, int height) {
+  for (int y = 0; y < height; ++y) {
+    size_t width_tmp = (size_t)(width);
+    uint32* d = (uint32*)(dst);
+    asm volatile (
+      "rep stosl " MEMSTORESTRING(eax,0) "     \n"
+      : "+D"(d),         // %0
+        "+c"(width_tmp)  // %1
+      : "a"(v32)         // %2
+      : "memory", "cc");
+    dst += dst_stride;
+  }
+}
+#endif  // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_SSE2
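+// YUY2 stores luma in the even bytes (Y0 U Y1 V), so Y is extracted by
+// masking with 0x00ff and packing; 16 Y bytes per loop.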
+void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
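+// Produce half-width U and V rows from two YUY2 rows: the rows are averaged
+// with pavgb, the chroma bytes are isolated with a right shift, and the
+// interleaved UV is split into dst_u and dst_v (8 of each per loop).
+// YUY2ToUV422Row below does the same from a single row, without averaging.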
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"((intptr_t)(stride_yuy2))  // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+                               uint8* dst_y, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
+                                int stride_yuy2,
+                                uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"((intptr_t)(stride_yuy2))  // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+                                   uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_yuy2),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
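+// UYVY stores luma in the odd bytes (U Y0 V Y1), so the Y extraction below
+// shifts right by 8 instead of masking; the UV variants parallel the YUY2
+// versions above with the mask/shift roles swapped.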
+void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"((intptr_t)(stride_uyvy))  // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+                               uint8* dst_y, int pix) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                                uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
+    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  : "r"((intptr_t)(stride_uyvy))  // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+                                   uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+    "sub       %1,%2                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_uyvy),    // %0
+    "+r"(dst_u),       // %1
+    "+r"(dst_v),       // %2
+    "+r"(pix)          // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_YUY2TOYROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 4 ARGB pixels at a time (with 1 pixel loops for alignment and tails).
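+// dst = src0 + src1 * (256 - alpha(src0)) / 256 per channel, with the result
+// alpha forced to 0xff. A 1 pixel loop runs until dst is 16-byte aligned.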
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psrlw     $0xf,%%xmm7                     \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x8,%%xmm6                     \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psllw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
+    "sub       $0x1,%3                         \n"
+    "je        91f                             \n"
+    "jl        99f                             \n"
+
+    // 1 pixel loop until destination pointer is aligned.
+  "10:                                         \n"
+    "test      $0xf,%2                         \n"
+    "je        19f                             \n"
+    "movd      " MEMACCESS(0) ",%%xmm3         \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm2         \n"
+    "psrlw     $0x8,%%xmm3                     \n"
+    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x4,1) ",%1            \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x1,%3                         \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "jge       10b                             \n"
+
+  "19:                                         \n"
+    "add       $1-4,%3                         \n"
+    "jl        49f                             \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "41:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
+    "psrlw     $0x8,%%xmm3                     \n"
+    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jge       41b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"
+    "jl        99f                             \n"
+
+    // 1 pixel loop.
+  "91:                                         \n"
+    "movd      " MEMACCESS(0) ",%%xmm3         \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm2         \n"
+    "psrlw     $0x8,%%xmm3                     \n"
+    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x4,1) ",%1            \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x1,%3                         \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "jge       91b                             \n"
+  "99:                                         \n"
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static uvec8 kShuffleAlpha = {
+  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+
+// Blend 4 ARGB pixels at a time.
+
+// Same as SSE2, but replaces
+//    psrlw      xmm3, 8          // alpha
+//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
+//    pshuflw    xmm3, xmm3,0F5h
+// with..
+//    pshufb     xmm3, kShuffleAlpha // alpha
+
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psrlw     $0xf,%%xmm7                     \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x8,%%xmm6                     \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psllw     $0x8,%%xmm5                     \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
+    "sub       $0x1,%3                         \n"
+    "je        91f                             \n"
+    "jl        99f                             \n"
+
+    // 1 pixel loop until destination pointer is aligned.
+  "10:                                         \n"
+    "test      $0xf,%2                         \n"
+    "je        19f                             \n"
+    "movd      " MEMACCESS(0) ",%%xmm3         \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm2         \n"
+    "pshufb    %4,%%xmm3                       \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x4,1) ",%1            \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x1,%3                         \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "jge       10b                             \n"
+
+  "19:                                         \n"
+    "add       $1-4,%3                         \n"
+    "jl        49f                             \n"
+    "test      $0xf,%0                         \n"
+    "jne       41f                             \n"
+    "test      $0xf,%1                         \n"
+    "jne       41f                             \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "40:                                         \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm3         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
+    "pshufb    %4,%%xmm3                       \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jge       40b                             \n"
+    "jmp       49f                             \n"
+
+    // 4 pixel unaligned loop.
+    LABELALIGN
+  "41:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
+    "pshufb    %4,%%xmm3                       \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jge       41b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"
+    "jl        99f                             \n"
+
+    // 1 pixel loop.
+  "91:                                         \n"
+    "movd      " MEMACCESS(0) ",%%xmm3         \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "movdqa    %%xmm3,%%xmm0                   \n"
+    "pxor      %%xmm4,%%xmm3                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm2         \n"
+    "pshufb    %4,%%xmm3                       \n"
+    "pand      %%xmm6,%%xmm2                   \n"
+    "paddw     %%xmm7,%%xmm3                   \n"
+    "pmullw    %%xmm3,%%xmm2                   \n"
+    "movd      " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x4,1) ",%1            \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "por       %%xmm4,%%xmm0                   \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm2                     \n"
+    "paddusb   %%xmm2,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x1,%3                         \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "jge       91b                             \n"
+  "99:                                         \n"
+  : "+r"(src_argb0),    // %0
+    "+r"(src_argb1),    // %1
+    "+r"(dst_argb),     // %2
+    "+r"(width)         // %3
+  : "m"(kShuffleAlpha)  // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_ARGBBLENDROW_SSSE3
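+
+// A rough scalar sketch of what the two blend rows above compute per pixel,
+// assuming little-endian BGRA byte order (helper name is illustrative and not
+// part of libyuv's API; the SIMD paths use the same (256 - a) * b >> 8 form
+// and a saturating add, per the comments above):
+static void ARGBBlendPixelSketch(const uint8* fg, const uint8* bg,
+                                 uint8* dst) {
+  uint32 a = fg[3];  // foreground alpha
+  int i;
+  for (i = 0; i < 3; ++i) {  // B, G, R
+    uint32 v = fg[i] + (((256 - a) * bg[i]) >> 8);
+    dst[i] = (uint8)(v > 255 ? 255 : v);  // paddusb saturates the same way
+  }
+  dst[3] = 255;  // alpha forced to opaque (por with 0xff000000)
+}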
+
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time.
+// aligned to 16 bytes
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "pslld     $0x18,%%xmm4                    \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrld     $0x8,%%xmm5                     \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
+    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
+    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "pand      %%xmm4,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "por       %%xmm2,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_SSE2
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha
+static uvec8 kShuffleAlpha0 = {
+  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static uvec8 kShuffleAlpha1 = {
+  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+// Attenuate 4 pixels at a time.
+// Alignment is not required; this version uses unaligned (movdqu) accesses.
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "pcmpeqb   %%xmm3,%%xmm3                   \n"
+    "pslld     $0x18,%%xmm3                    \n"
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "punpcklbw %%xmm1,%%xmm1                   \n"
+    "pmulhuw   %%xmm1,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "punpckhbw %%xmm2,%%xmm2                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "pand      %%xmm3,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "por       %%xmm2,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width)        // %2
+  : "m"(kShuffleAlpha0),  // %3
+    "m"(kShuffleAlpha1)  // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBATTENUATEROW_SSSE3
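+
+// Attenuation premultiplies B, G and R by alpha. A minimal scalar sketch of
+// the intent (the SSE2/SSSE3 rows above work in 8.8 fixed point via pmulhuw,
+// so their rounding differs slightly from the exact divide shown here):
+static void ARGBAttenuatePixelSketch(const uint8* src, uint8* dst) {
+  uint32 a = src[3];
+  dst[0] = (uint8)((src[0] * a) / 255);  // B
+  dst[1] = (uint8)((src[1] * a) / 255);  // G
+  dst[2] = (uint8)((src[2] * a) / 255);  // R
+  dst[3] = (uint8)a;                     // alpha is carried through
+}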
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+// Alignment is not required; this version uses unaligned (movdqu) accesses.
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  uintptr_t alpha = 0;
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    "movlhps   %%xmm3,%%xmm2                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
+    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    BUNDLEALIGN
+    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
+    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
+    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    "movlhps   %%xmm3,%%xmm2                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(width),       // %2
+    "+r"(alpha)        // %3
+  : "r"(fixed_invtbl8)  // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBUNATTENUATEROW_SSE2
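+
+// Unattenuation is the inverse: scale B, G and R back up by 255 / alpha and
+// clamp. The SSE2 row above replaces the divide with a lookup into the
+// fixed_invtbl8 reciprocal table; the zero-alpha handling below is only an
+// illustration of intent, not a claim about the table-based path:
+static void ARGBUnattenuatePixelSketch(const uint8* src, uint8* dst) {
+  uint32 a = src[3];
+  int i;
+  for (i = 0; i < 3; ++i) {  // B, G, R
+    uint32 v = a ? (src[i] * 255u) / a : src[i];
+    dst[i] = (uint8)(v > 255 ? 255 : v);
+  }
+  dst[3] = (uint8)a;
+}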
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm4,%%xmm0                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm0                   \n"
+    "paddw     %%xmm5,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrld     $0x18,%%xmm2                    \n"
+    "psrld     $0x18,%%xmm3                    \n"
+    "packuswb  %%xmm3,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpcklbw %%xmm2,%%xmm3                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm3,%%xmm0                   \n"
+    "punpckhwd %%xmm3,%%xmm1                   \n"
+    "sub       $0x8,%2                         \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(width)       // %2
+  : "m"(kARGBToYJ),   // %3
+    "m"(kAddYJ64)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBGRAYROW_SSSE3
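+
+// The gray row computes a 7-bit fixed-point luma from B, G and R using the
+// kARGBToYJ weights (defined earlier in this file), rounds with the +64 bias
+// from kAddYJ64, and writes the result to all three color channels, keeping
+// alpha. A scalar sketch with the weights passed in symbolically (the exact
+// coefficient values are not restated here):
+static void ARGBGrayPixelSketch(const uint8* src, uint8* dst,
+                                int wb, int wg, int wr) {
+  uint32 y = (uint32)(src[0] * wb + src[1] * wg + src[2] * wr + 64) >> 7;
+  dst[0] = dst[1] = dst[2] = (uint8)(y > 255 ? 255 : y);
+  dst[3] = src[3];  // alpha preserved
+}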
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+// Constants for converting ARGB color to sepia tone:
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+static vec8 kARGBToSepiaB = {
+  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static vec8 kARGBToSepiaG = {
+  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static vec8 kARGBToSepiaR = {
+  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+  asm volatile (
+    "movdqa    %2,%%xmm2                       \n"
+    "movdqa    %3,%%xmm3                       \n"
+    "movdqa    %4,%%xmm4                       \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "pmaddubsw %%xmm2,%%xmm6                   \n"
+    "phaddw    %%xmm6,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm3,%%xmm5                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm5                   \n"
+    "psrlw     $0x7,%%xmm5                     \n"
+    "packuswb  %%xmm5,%%xmm5                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm4,%%xmm5                   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "phaddw    %%xmm1,%%xmm5                   \n"
+    "psrlw     $0x7,%%xmm5                     \n"
+    "packuswb  %%xmm5,%%xmm5                   \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "psrld     $0x18,%%xmm6                    \n"
+    "psrld     $0x18,%%xmm1                    \n"
+    "packuswb  %%xmm1,%%xmm6                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "punpcklbw %%xmm6,%%xmm5                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklwd %%xmm5,%%xmm0                   \n"
+    "punpckhwd %%xmm5,%%xmm1                   \n"
+    "sub       $0x8,%1                         \n"
+    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "jg        1b                              \n"
+  : "+r"(dst_argb),      // %0
+    "+r"(width)          // %1
+  : "m"(kARGBToSepiaB),  // %2
+    "m"(kARGBToSepiaG),  // %3
+    "m"(kARGBToSepiaR)   // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+#endif  // HAS_ARGBSEPIAROW_SSSE3
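+
+// A scalar sketch of the sepia math above; the pmaddubsw/phaddw pairs compute
+// exactly these weighted sums, psrlw does the >> 7 and packuswb clamps to 255
+// (helper name illustrative; operates on one BGRA pixel in place):
+static void ARGBSepiaPixelSketch(uint8* p) {
+  int b = p[0], g = p[1], r = p[2];
+  int nb = (r * 35 + g * 68 + b * 17) >> 7;
+  int ng = (r * 45 + g * 88 + b * 22) >> 7;
+  int nr = (r * 50 + g * 98 + b * 24) >> 7;
+  p[0] = (uint8)(nb > 255 ? 255 : nb);
+  p[1] = (uint8)(ng > 255 ? 255 : ng);
+  p[2] = (uint8)(nr > 255 ? 255 : nr);
+  // p[3] (alpha) is left untouched.
+}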
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width) {
+  asm volatile (
+    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
+    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
+    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
+    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
+    "pshufd    $0xff,%%xmm5,%%xmm5             \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "pmaddubsw %%xmm2,%%xmm7                   \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "pmaddubsw %%xmm3,%%xmm6                   \n"
+    "pmaddubsw %%xmm3,%%xmm1                   \n"
+    "phaddsw   %%xmm7,%%xmm0                   \n"
+    "phaddsw   %%xmm1,%%xmm6                   \n"
+    "psraw     $0x6,%%xmm0                     \n"
+    "psraw     $0x6,%%xmm6                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "punpcklbw %%xmm6,%%xmm0                   \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "pmaddubsw %%xmm4,%%xmm1                   \n"
+    "pmaddubsw %%xmm4,%%xmm7                   \n"
+    "phaddsw   %%xmm7,%%xmm1                   \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
+    "pmaddubsw %%xmm5,%%xmm6                   \n"
+    "pmaddubsw %%xmm5,%%xmm7                   \n"
+    "phaddsw   %%xmm7,%%xmm6                   \n"
+    "psraw     $0x6,%%xmm1                     \n"
+    "psraw     $0x6,%%xmm6                     \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "punpcklbw %%xmm6,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "punpcklwd %%xmm1,%%xmm0                   \n"
+    "punpckhwd %%xmm1,%%xmm6                   \n"
+    "sub       $0x8,%2                         \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm6," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),      // %0
+    "+r"(dst_argb),      // %1
+    "+r"(width)          // %2
+  : "r"(matrix_argb)     // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
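+
+// The color-matrix row generalizes the sepia case: each output channel is a
+// signed dot product of the input (B, G, R, A) with one row of matrix_argb,
+// shifted down by 6 (psraw $0x6) and clamped. A scalar sketch, assuming the
+// matrix is laid out as four rows of four int8 coefficients:
+static void ARGBColorMatrixPixelSketch(const uint8* src, uint8* dst,
+                                       const int8* m) {
+  int j;
+  for (j = 0; j < 4; ++j) {
+    int v = (src[0] * m[j * 4 + 0] + src[1] * m[j * 4 + 1] +
+             src[2] * m[j * 4 + 2] + src[3] * m[j * 4 + 3]) >> 6;
+    dst[j] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
+  }
+}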
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+// aligned to 16 bytes
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  asm volatile (
+    "movd      %2,%%xmm2                       \n"
+    "movd      %3,%%xmm3                       \n"
+    "movd      %4,%%xmm4                       \n"
+    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
+    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
+    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
+    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "pslld     $0x18,%%xmm6                    \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "pmullw    %%xmm3,%%xmm0                   \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm7         \n"
+    "pmullw    %%xmm3,%%xmm1                   \n"
+    "pand      %%xmm6,%%xmm7                   \n"
+    "paddw     %%xmm4,%%xmm0                   \n"
+    "paddw     %%xmm4,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "por       %%xmm7,%%xmm0                   \n"
+    "sub       $0x4,%1                         \n"
+    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "jg        1b                              \n"
+  : "+r"(dst_argb),       // %0
+    "+r"(width)           // %1
+  : "r"(scale),           // %2
+    "r"(interval_size),   // %3
+    "r"(interval_offset)  // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_ARGBQUANTIZEROW_SSE2
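+
+// Quantization posterizes B, G and R in place: scale is a caller-supplied
+// 16.16-style factor applied with pmulhuw (the product's high 16 bits select
+// the interval index), which is then rescaled and offset. Alpha is not
+// quantized. A rough per-channel sketch:
+static uint8 QuantizeChannelSketch(uint8 v, int scale, int interval_size,
+                                   int interval_offset) {
+  return (uint8)(((v * scale) >> 16) * interval_size + interval_offset);
+}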
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+// Aligned to 16 bytes.
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    "movd      %3,%%xmm2                       \n"
+    "punpcklbw %%xmm2,%%xmm2                   \n"
+    "punpcklqdq %%xmm2,%%xmm2                  \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "pmulhuw   %%xmm2,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(value)       // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2"
+#endif
+  );
+}
+#endif  // HAS_ARGBSHADEROW_SSE2
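+
+// Shading multiplies every channel (including alpha) by the matching byte of
+// 'value'. The SSE2 row does this in 8.8 fixed point (byte-doubled operands,
+// pmulhuw, then >> 8), which is approximately channel * value_byte / 255; a
+// scalar sketch assuming little-endian BGRA packing of 'value':
+static void ARGBShadePixelSketch(const uint8* src, uint8* dst, uint32 value) {
+  int i;
+  for (i = 0; i < 4; ++i) {
+    uint32 v = (value >> (i * 8)) & 0xff;        // per-channel factor
+    dst[i] = (uint8)((src[i] * v + 127) / 255);  // approximate rounding
+  }
+}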
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                   \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "movdqu    %%xmm0,%%xmm1                   \n"
+    "movdqu    %%xmm2,%%xmm3                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpckhbw %%xmm5,%%xmm3                   \n"
+    "pmulhuw   %%xmm2,%%xmm0                   \n"
+    "pmulhuw   %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif  // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  asm volatile (
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "psubusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(src_argb1),  // %1
+    "+r"(dst_argb),   // %2
+    "+r"(width)       // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif  // HAS_ARGBSUBTRACTROW_SSE2
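+
+// The multiply, add and subtract rows above are simple per-byte operations on
+// the two source rows. A compact scalar sketch of all three (the multiply is
+// approximate: the SSE2 path uses 8.8 fixed point via pmulhuw rather than an
+// exact divide by 255):
+static void ARGBArithByteSketch(uint8 a, uint8 b,
+                                uint8* mul, uint8* add, uint8* sub) {
+  uint32 s = (uint32)a + b;
+  *mul = (uint8)((a * b + 127) / 255);  // ~ARGBMultiplyRow
+  *add = (uint8)(s > 255 ? 255 : s);    // ARGBAddRow: paddusb saturates
+  *sub = (uint8)(a > b ? a - b : 0);    // ARGBSubtractRow: psubusb floors at 0
+}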
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    "sub       %0,%2                           \n"
+    "sub       %0,%3                           \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "psubw     %%xmm1,%%xmm0                   \n"
+    BUNDLEALIGN
+    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
+    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "psubw     %%xmm2,%%xmm1                   \n"
+    BUNDLEALIGN
+    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
+    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"
+    "psubw     %%xmm3,%%xmm2                   \n"
+    "paddw     %%xmm2,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "psubw     %%xmm0,%%xmm1                   \n"
+    "pmaxsw    %%xmm1,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "sub       $0x8,%4                         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "jg        1b                              \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(src_y2),      // %2
+    "+r"(dst_sobelx),  // %3
+    "+r"(width)        // %4
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_SOBELXROW_SSE2
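+
+// A scalar sketch of one Sobel X output, matching the matrix comment above:
+// column differences across the three source rows, middle row weighted by 2,
+// absolute value, clamped to 255 (pmaxsw of x and -x computes the abs,
+// packuswb does the clamp):
+static uint8 SobelXPixelSketch(const uint8* y0, const uint8* y1,
+                               const uint8* y2, int i) {
+  int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
+          (y2[i] - y2[i + 2]);
+  if (s < 0) s = -s;
+  return (uint8)(s > 255 ? 255 : s);
+}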
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    "sub       %0,%2                           \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "psubw     %%xmm1,%%xmm0                   \n"
+    BUNDLEALIGN
+    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
+    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "psubw     %%xmm2,%%xmm1                   \n"
+    BUNDLEALIGN
+    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
+    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
+    "punpcklbw %%xmm5,%%xmm2                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"
+    "psubw     %%xmm3,%%xmm2                   \n"
+    "paddw     %%xmm2,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "paddw     %%xmm1,%%xmm0                   \n"
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "psubw     %%xmm0,%%xmm1                   \n"
+    "pmaxsw    %%xmm1,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "sub       $0x8,%3                         \n"
+    BUNDLEALIGN
+    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "jg        1b                              \n"
+  : "+r"(src_y0),      // %0
+    "+r"(src_y1),      // %1
+    "+r"(dst_sobely),  // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_SOBELYROW_SSE2
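+
+// A scalar sketch of one Sobel Y output: row differences across three
+// columns, middle column weighted by 2, absolute value, clamped to 255:
+static uint8 SobelYPixelSketch(const uint8* y0, const uint8* y1, int i) {
+  int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
+          (y0[i + 2] - y1[i + 2]);
+  if (s < 0) s = -s;
+  return (uint8)(s > 255 ? 255 : s);
+}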
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0x18,%%xmm5                    \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "punpcklbw %%xmm0,%%xmm2                   \n"
+    "punpckhbw %%xmm0,%%xmm0                   \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "punpcklwd %%xmm2,%%xmm1                   \n"
+    "punpckhwd %%xmm2,%%xmm2                   \n"
+    "por       %%xmm5,%%xmm1                   \n"
+    "por       %%xmm5,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklwd %%xmm0,%%xmm3                   \n"
+    "punpckhwd %%xmm0,%%xmm0                   \n"
+    "por       %%xmm5,%%xmm3                   \n"
+    "por       %%xmm5,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movdqa    %%xmm1," MEMACCESS(2) "         \n"
+    "movdqa    %%xmm2," MEMACCESS2(0x10,2) "   \n"
+    "movdqa    %%xmm3," MEMACCESS2(0x20,2) "   \n"
+    "movdqa    %%xmm0," MEMACCESS2(0x30,2) "   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "jg        1b                              \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_SOBELROW_SSE2
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "pslld     $0x18,%%xmm5                    \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%3                        \n"
+    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jg        1b                              \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_y),       // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif  // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
+    // 8 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "paddusb   %%xmm1,%%xmm2                   \n"
+    "movdqa    %%xmm0,%%xmm3                   \n"
+    "punpcklbw %%xmm5,%%xmm3                   \n"
+    "punpckhbw %%xmm5,%%xmm0                   \n"
+    "movdqa    %%xmm1,%%xmm4                   \n"
+    "punpcklbw %%xmm2,%%xmm4                   \n"
+    "punpckhbw %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm4,%%xmm6                   \n"
+    "punpcklwd %%xmm3,%%xmm6                   \n"
+    "punpckhwd %%xmm3,%%xmm4                   \n"
+    "movdqa    %%xmm1,%%xmm7                   \n"
+    "punpcklwd %%xmm0,%%xmm7                   \n"
+    "punpckhwd %%xmm0,%%xmm1                   \n"
+    "sub       $0x10,%3                        \n"
+    "movdqa    %%xmm6," MEMACCESS(2) "         \n"
+    "movdqa    %%xmm4," MEMACCESS2(0x10,2) "   \n"
+    "movdqa    %%xmm7," MEMACCESS2(0x20,2) "   \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x30,2) "   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "jg        1b                              \n"
+  : "+r"(src_sobelx),  // %0
+    "+r"(src_sobely),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(width)        // %3
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_SOBELXYROW_SSE2
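+
+// How the three packing rows above combine the Sobel planes for one pixel,
+// assuming little-endian BGRA byte order (sketch only; the SIMD versions do
+// the same with a saturating paddusb and byte/word unpacks):
+static void SobelPackPixelSketch(uint8 sx, uint8 sy, uint8* argb,
+                                 uint8* xy_argb, uint8* plane) {
+  uint32 t = (uint32)sx + sy;
+  uint8 s = (uint8)(t > 255 ? 255 : t);  // combined Sobel magnitude
+  argb[0] = argb[1] = argb[2] = s;       // SobelRow: B = G = R = Sobel
+  argb[3] = 255;
+  xy_argb[0] = sy;                       // SobelXYRow: B = Sobel Y
+  xy_argb[1] = s;                        //             G = Sobel
+  xy_argb[2] = sx;                       //             R = Sobel X
+  xy_argb[3] = 255;
+  *plane = s;                            // SobelToPlaneRow
+}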
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value, inclusive of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  const int32* previous_cumsum, int width) {
+  asm volatile (
+    "pxor      %%xmm0,%%xmm0                   \n"
+    "pxor      %%xmm1,%%xmm1                   \n"
+    "sub       $0x4,%3                         \n"
+    "jl        49f                             \n"
+    "test      $0xf,%1                         \n"
+    "jne       49f                             \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "40:                                         \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm2,%%xmm4                   \n"
+    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklwd %%xmm1,%%xmm2                   \n"
+    "punpckhwd %%xmm1,%%xmm3                   \n"
+    "punpckhbw %%xmm1,%%xmm4                   \n"
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "punpcklwd %%xmm1,%%xmm4                   \n"
+    "punpckhwd %%xmm1,%%xmm5                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "movdqa    " MEMACCESS(2) ",%%xmm2         \n"
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"
+    "movdqa    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
+    "paddd     %%xmm0,%%xmm3                   \n"
+    "paddd     %%xmm4,%%xmm0                   \n"
+    "movdqa    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
+    "paddd     %%xmm0,%%xmm4                   \n"
+    "paddd     %%xmm5,%%xmm0                   \n"
+    "movdqa    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
+    "lea       " MEMLEA(0x40,2) ",%2           \n"
+    "paddd     %%xmm0,%%xmm5                   \n"
+    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
+    "movdqa    %%xmm4," MEMACCESS2(0x20,1) "   \n"
+    "movdqa    %%xmm5," MEMACCESS2(0x30,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"
+    "jl        19f                             \n"
+
+    // 1 pixel loop.
+    LABELALIGN
+  "10:                                         \n"
+    "movd      " MEMACCESS(0) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "punpcklbw %%xmm1,%%xmm2                   \n"
+    "punpcklwd %%xmm1,%%xmm2                   \n"
+    "paddd     %%xmm2,%%xmm0                   \n"
+    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x1,%3                         \n"
+    "jge       10b                             \n"
+
+  "19:                                         \n"
+  : "+r"(row),  // %0
+    "+r"(cumsum),  // %1
+    "+r"(previous_cumsum),  // %2
+    "+r"(width)  // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
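+
+// A scalar sketch of the cumulative-sum row: a running per-channel sum across
+// the row is added to the matching entry of the row above (previous_cumsum),
+// producing four int32 values per pixel:
+static void ComputeCumulativeSumRowSketch(const uint8* row, int32* cumsum,
+                                          const int32* previous_cumsum,
+                                          int width) {
+  int32 sum[4] = {0, 0, 0, 0};
+  int x, c;
+  for (x = 0; x < width; ++x) {
+    for (c = 0; c < 4; ++c) {
+      sum[c] += row[x * 4 + c];
+      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
+    }
+  }
+}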
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst,
+                                    int count) {
+  asm volatile (
+    "movd      %5,%%xmm5                       \n"
+    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
+    "rcpss     %%xmm5,%%xmm4                   \n"
+    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+    "sub       $0x4,%3                         \n"
+    "jl        49f                             \n"
+    "cmpl      $0x80,%5                        \n"
+    "ja        40f                             \n"
+
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrld     $0x10,%%xmm6                    \n"
+    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
+    "addps     %%xmm6,%%xmm5                   \n"
+    "mulps     %%xmm4,%%xmm5                   \n"
+    "cvtps2dq  %%xmm5,%%xmm5                   \n"
+    "packssdw  %%xmm5,%%xmm5                   \n"
+
+    // 4 pixel small loop.
+    LABELALIGN
+  "4:                                         \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    BUNDLEALIGN
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
+    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
+    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
+    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
+    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
+    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
+    BUNDLEALIGN
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
+    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
+    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "packssdw  %%xmm1,%%xmm0                   \n"
+    "packssdw  %%xmm3,%%xmm2                   \n"
+    "pmulhuw   %%xmm5,%%xmm0                   \n"
+    "pmulhuw   %%xmm5,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       4b                              \n"
+    "jmp       49f                             \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "40:                                         \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
+    BUNDLEALIGN
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
+    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
+    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
+    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
+    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
+    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
+    BUNDLEALIGN
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
+    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
+    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
+    "mulps     %%xmm4,%%xmm0                   \n"
+    "mulps     %%xmm4,%%xmm1                   \n"
+    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
+    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
+    "mulps     %%xmm4,%%xmm2                   \n"
+    "mulps     %%xmm4,%%xmm3                   \n"
+    "cvtps2dq  %%xmm0,%%xmm0                   \n"
+    "cvtps2dq  %%xmm1,%%xmm1                   \n"
+    "cvtps2dq  %%xmm2,%%xmm2                   \n"
+    "cvtps2dq  %%xmm3,%%xmm3                   \n"
+    "packssdw  %%xmm1,%%xmm0                   \n"
+    "packssdw  %%xmm3,%%xmm2                   \n"
+    "packuswb  %%xmm2,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "sub       $0x4,%3                         \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%3                         \n"
+    "jl        19f                             \n"
+
+    // 1 pixel loop.
+    LABELALIGN
+  "10:                                         \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
+    BUNDLEALIGN
+    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+    "mulps     %%xmm4,%%xmm0                   \n"
+    "cvtps2dq  %%xmm0,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x4,2) ",%2            \n"
+    "sub       $0x1,%3                         \n"
+    "jge       10b                             \n"
+  "19:                                         \n"
+  : "+r"(topleft),  // %0
+    "+r"(botleft),  // %1
+    "+r"(dst),      // %2
+    "+rm"(count)    // %3
+  : "r"((intptr_t)(width)),  // %4
+    "rm"(area)     // %5
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
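+
+// The averaging row uses the usual integral-image identity: a box sum is
+// bottomright + topleft - bottomleft - topright, then divided by area. A
+// scalar sketch of one output pixel, treating the width operand as the int32
+// offset to the box's right edge (4 int32s per pixel), which is how the %4
+// register is used in the addressing above; the SIMD path multiplies by a
+// reciprocal (rcpss) instead of dividing:
+static void CumulativeSumToAveragePixelSketch(const int32* tl, const int32* bl,
+                                              int w, int area, uint8* dst) {
+  int c;
+  for (c = 0; c < 4; ++c) {
+    int32 sum = bl[w + c] + tl[c] - bl[c] - tl[w + c];
+    dst[c] = (uint8)(sum / area);
+  }
+}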
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy ARGB pixels from the source image to a destination row, stepping
+// through the source along the (du, dv) slope given in src_dudv.
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* src_dudv, int width) {
+  intptr_t src_argb_stride_temp = src_argb_stride;
+  intptr_t temp = 0;
+  asm volatile (
+    "movq      " MEMACCESS(3) ",%%xmm2         \n"
+    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
+    "shl       $0x10,%1                        \n"
+    "add       $0x4,%1                         \n"
+    "movd      %1,%%xmm5                       \n"
+    "sub       $0x4,%4                         \n"
+    "jl        49f                             \n"
+
+    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "addps     %%xmm7,%%xmm0                   \n"
+    "movlhps   %%xmm0,%%xmm2                   \n"
+    "movdqa    %%xmm7,%%xmm4                   \n"
+    "addps     %%xmm4,%%xmm4                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "addps     %%xmm4,%%xmm3                   \n"
+    "addps     %%xmm4,%%xmm4                   \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "40:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
+    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
+    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
+    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
+    "movd      %%xmm0,%k1                      \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    "movd      %%xmm0,%k5                      \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    BUNDLEALIGN
+    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
+    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
+    "punpckldq %%xmm6,%%xmm1                   \n"
+    "addps     %%xmm4,%%xmm2                   \n"
+    "movq      %%xmm1," MEMACCESS(2) "         \n"
+    "movd      %%xmm0,%k1                      \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+    "movd      %%xmm0,%k5                      \n"
+    BUNDLEALIGN
+    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
+    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
+    "punpckldq %%xmm6,%%xmm0                   \n"
+    "addps     %%xmm4,%%xmm3                   \n"
+    "sub       $0x4,%4                         \n"
+    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "add       $0x3,%4                         \n"
+    "jl        19f                             \n"
+
+    // 1 pixel loop.
+    LABELALIGN
+  "10:                                         \n"
+    "cvttps2dq %%xmm2,%%xmm0                   \n"
+    "packssdw  %%xmm0,%%xmm0                   \n"
+    "pmaddwd   %%xmm5,%%xmm0                   \n"
+    "addps     %%xmm7,%%xmm2                   \n"
+    "movd      %%xmm0,%k1                      \n"
+    BUNDLEALIGN
+    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
+    "sub       $0x1,%4                         \n"
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x04,2) ",%2           \n"
+    "jge       10b                             \n"
+  "19:                                         \n"
+  : "+r"(src_argb),  // %0
+    "+r"(src_argb_stride_temp),  // %1
+    "+r"(dst_argb),  // %2
+    "+r"(src_dudv),  // %3
+    "+rm"(width),    // %4
+    "+r"(temp)   // %5
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
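+
+// The affine row walks (u, v) source coordinates: src_dudv holds the starting
+// (u, v) followed by the per-pixel (du, dv) step, and each destination pixel
+// copies the 4-byte source pixel at the truncated coordinates (cvttps2dq
+// truncates; the pmaddwd with packed (4, stride) forms the byte offset). A
+// scalar sketch:
+static void ARGBAffineRowSketch(const uint8* src_argb, int src_argb_stride,
+                                uint8* dst_argb, const float* src_dudv,
+                                int width) {
+  float u = src_dudv[0];
+  float v = src_dudv[1];
+  int i, c;
+  for (i = 0; i < width; ++i) {
+    int x = (int)u;
+    int y = (int)v;
+    const uint8* s = src_argb + y * src_argb_stride + x * 4;
+    for (c = 0; c < 4; ++c) {
+      dst_argb[i * 4 + c] = s[c];
+    }
+    u += src_dudv[2];
+    v += src_dudv[3];
+  }
+}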
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {
+  asm volatile (
+    "sub       %1,%0                           \n"
+    "shr       %3                              \n"
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+
+    "movd      %3,%%xmm0                       \n"
+    "neg       %3                              \n"
+    "add       $0x80,%3                        \n"
+    "movd      %3,%%xmm5                       \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"
+    "punpcklwd %%xmm5,%%xmm5                   \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+
+    // General purpose row blend.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqa,0x00,1,4,1,xmm2)
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm2,%%xmm0                   \n"
+    "punpckhbw %%xmm2,%%xmm1                   \n"
+    "pmaddubsw %%xmm5,%%xmm0                   \n"
+    "pmaddubsw %%xmm5,%%xmm1                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
+    LABELALIGN
+  "25:                                         \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    LABELALIGN
+  "75:                                         \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
+    MEMOPREG(movdqa,0x00,1,4,1,xmm0)
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    "sub       $0x10,%2                        \n"
+    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        100b                            \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),    // %0
+    "+r"(src_ptr),    // %1
+    "+r"(dst_width),  // %2
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_INTERPOLATEROW_SSSE3
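+
+// Reference sketch (illustrative, unused): a plausible scalar reading of the
+// general blend path shared by the InterpolateRow kernels. source_y_fraction
+// is 0..256; the asm halves it ("shr %3") and blends with 7-bit weights. The
+// pavgb-based 0/25/50/75/100% fast paths above round slightly differently.
+static void InterpolateRowSketch_C(uint8* dst_ptr, const uint8* src_ptr,
+                                   ptrdiff_t src_stride, int dst_width,
+                                   int source_y_fraction) {
+  int f = source_y_fraction >> 1;  // 0..128, as after "shr %3".
+  for (int x = 0; x < dst_width; ++x) {
+    dst_ptr[x] = (uint8)((src_ptr[x] * (128 - f) +
+                          src_ptr[x + src_stride] * f) >> 7);
+  }
+}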
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {
+  asm volatile (
+    "sub       %1,%0                           \n"
+    "shr       %3                              \n"
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+
+    "movd      %3,%%xmm0                       \n"
+    "neg       %3                              \n"
+    "add       $0x80,%3                        \n"
+    "movd      %3,%%xmm5                       \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"
+    "punpcklwd %%xmm5,%%xmm5                   \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+
+    // General purpose row blend.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqa,0x00,1,4,1,xmm2)           //  movdqa    (%1,%4,1),%%xmm2
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklbw %%xmm4,%%xmm2                   \n"
+    "punpckhbw %%xmm4,%%xmm3                   \n"
+    "punpcklbw %%xmm4,%%xmm0                   \n"
+    "punpckhbw %%xmm4,%%xmm1                   \n"
+    "psubw     %%xmm0,%%xmm2                   \n"
+    "psubw     %%xmm1,%%xmm3                   \n"
+    "paddw     %%xmm2,%%xmm2                   \n"
+    "paddw     %%xmm3,%%xmm3                   \n"
+    "pmulhw    %%xmm5,%%xmm2                   \n"
+    "pmulhw    %%xmm5,%%xmm3                   \n"
+    "paddw     %%xmm2,%%xmm0                   \n"
+    "paddw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
+    LABELALIGN
+  "25:                                         \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    LABELALIGN
+  "75:                                         \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
+    MEMOPREG(movdqa,0x00,1,4,1,xmm0)           //  movdqa    (%1,%4,1),%%xmm0
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    "sub       $0x10,%2                        \n"
+    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        100b                            \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),    // %0
+    "+r"(src_ptr),    // %1
+    "+r"(dst_width),  // %2
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_INTERPOLATEROW_SSE2
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                                    ptrdiff_t src_stride, int dst_width,
+                                    int source_y_fraction) {
+  asm volatile (
+    "sub       %1,%0                           \n"
+    "shr       %3                              \n"
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+
+    "movd      %3,%%xmm0                       \n"
+    "neg       %3                              \n"
+    "add       $0x80,%3                        \n"
+    "movd      %3,%%xmm5                       \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"
+    "punpcklwd %%xmm5,%%xmm5                   \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+
+    // General purpose row blend.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
+    "movdqu    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm2,%%xmm0                   \n"
+    "punpckhbw %%xmm2,%%xmm1                   \n"
+    "pmaddubsw %%xmm5,%%xmm0                   \n"
+    "pmaddubsw %%xmm5,%%xmm1                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "psrlw     $0x7,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
+    LABELALIGN
+  "25:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    LABELALIGN
+  "75:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    "sub       $0x10,%2                        \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        100b                            \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),    // %0
+    "+r"(src_ptr),    // %1
+    "+r"(dst_width),  // %2
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_INTERPOLATEROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
+void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                                   ptrdiff_t src_stride, int dst_width,
+                                   int source_y_fraction) {
+  asm volatile (
+    "sub       %1,%0                           \n"
+    "shr       %3                              \n"
+    "cmp       $0x0,%3                         \n"
+    "je        100f                            \n"
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+
+    "movd      %3,%%xmm0                       \n"
+    "neg       %3                              \n"
+    "add       $0x80,%3                        \n"
+    "movd      %3,%%xmm5                       \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"
+    "punpcklwd %%xmm5,%%xmm5                   \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    "pxor      %%xmm4,%%xmm4                   \n"
+
+    // General purpose row blend.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
+    "movdqu    %%xmm0,%%xmm1                   \n"
+    "movdqu    %%xmm2,%%xmm3                   \n"
+    "punpcklbw %%xmm4,%%xmm2                   \n"
+    "punpckhbw %%xmm4,%%xmm3                   \n"
+    "punpcklbw %%xmm4,%%xmm0                   \n"
+    "punpckhbw %%xmm4,%%xmm1                   \n"
+    "psubw     %%xmm0,%%xmm2                   \n"
+    "psubw     %%xmm1,%%xmm3                   \n"
+    "paddw     %%xmm2,%%xmm2                   \n"
+    "paddw     %%xmm3,%%xmm3                   \n"
+    "pmulhw    %%xmm5,%%xmm2                   \n"
+    "pmulhw    %%xmm5,%%xmm3                   \n"
+    "paddw     %%xmm2,%%xmm0                   \n"
+    "paddw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
+    LABELALIGN
+  "25:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    LABELALIGN
+  "50:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    LABELALIGN
+  "75:                                         \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
+    MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    BUNDLEALIGN
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    LABELALIGN
+  "100:                                        \n"
+    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
+    "sub       $0x10,%2                        \n"
+    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        100b                            \n"
+
+  "99:                                         \n"
+  : "+r"(dst_ptr),    // %0
+    "+r"(src_ptr),    // %1
+    "+r"(dst_width),  // %2
+    "+r"(source_y_fraction)  // %3
+  : "r"((intptr_t)(src_stride))  // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_INTERPOLATEROW_SSE2
+
+#ifdef HAS_HALFROW_SSE2
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix) {
+  asm volatile (
+    "sub       %0,%1                           \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb     (%0,%3),%%xmm0
+    "sub       $0x10,%2                        \n"
+    MEMOPMEM(movdqa,xmm0,0x00,0,1,1)           //  movdqa    %%xmm0,(%0,%1)
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "jg        1b                              \n"
+  : "+r"(src_uv),  // %0
+    "+r"(dst_uv),  // %1
+    "+r"(pix)      // %2
+  : "r"((intptr_t)(src_uv_stride))  // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+      , "xmm0"
+#endif
+  );
+}
+#endif  // HAS_HALFROW_SSE2
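+
+// Reference sketch (illustrative, unused): HalfRow averages each byte of two
+// rows with rounding, matching pavgb: (a + b + 1) >> 1.
+static void HalfRowSketch_C(const uint8* src_uv, int src_uv_stride,
+                            uint8* dst_uv, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    dst_uv[x] = (uint8)((src_uv[x] + src_uv[x + src_uv_stride] + 1) >> 1);
+  }
+}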
+
+#ifdef HAS_ARGBTOBAYERROW_SSSE3
+void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+                          uint32 selector, int pix) {
+  asm volatile (
+    // NaCL caveat - assumes movd is from GPR
+    "movd      %3,%%xmm5                       \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    "sub       $0x8,%2                         \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_bayer), // %1
+    "+r"(pix)        // %2
+  : "g"(selector)    // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBTOBAYERROW_SSSE3
+
+#ifdef HAS_ARGBTOBAYERGGROW_SSE2
+void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
+                           uint32 selector, int pix) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrld     $0x18,%%xmm5                    \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrld     $0x8,%%xmm0                     \n"
+    "psrld     $0x8,%%xmm1                     \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packssdw  %%xmm1,%%xmm0                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x8,%2                         \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_bayer), // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBTOBAYERGGROW_SSE2
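+
+// Reference sketch (illustrative, unused): ARGBToBayerGGRow keeps only the G
+// byte of each B,G,R,A pixel (psrld 8 plus the 0xff mask above). The
+// selector-driven ARGBToBayerRow instead picks one byte per pixel as named by
+// the four selector bytes.
+static void ARGBToBayerGGRowSketch_C(const uint8* src_argb, uint8* dst_bayer,
+                                     uint32 selector, int pix) {
+  (void)selector;  // Unused in the green-only case; kept for signature parity.
+  for (int x = 0; x < pix; ++x) {
+    dst_bayer[x] = src_argb[x * 4 + 1];  // Byte 1 of B,G,R,A is G.
+  }
+}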
+
+#ifdef HAS_ARGBSHUFFLEROW_SSSE3
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int pix) {
+  asm volatile (
+    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "sub       $0x8,%2                         \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                    const uint8* shuffler, int pix) {
+  asm volatile (
+    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pshufb    %%xmm5,%%xmm0                   \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "sub       $0x8,%2                         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_SSSE3
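+
+// Reference sketch (illustrative, unused): the shuffler argument is a 16-byte
+// pshufb mask applied to each block of 4 ARGB pixels. An index byte with the
+// high bit set yields zero; otherwise it selects that byte of the block.
+// Assumes pix is a multiple of 4, as the SIMD kernels effectively do.
+static void ARGBShuffleRowSketch_C(const uint8* src_argb, uint8* dst_argb,
+                                   const uint8* shuffler, int pix) {
+  for (int x = 0; x < pix; x += 4) {  // 4 pixels = 16 bytes per block.
+    for (int k = 0; k < 16; ++k) {
+      dst_argb[x * 4 + k] = (shuffler[k] & 0x80) ?
+          0u : src_argb[x * 4 + (shuffler[k] & 15)];
+    }
+  }
+}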
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  asm volatile (
+    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
+    LABELALIGN
+  "1:                                          \n"
+    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
+    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
+    "lea       " MEMLEA(0x40,0) ",%0           \n"
+    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
+    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
+    "sub       $0x10,%2                        \n"
+    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
+    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
+    "lea       " MEMLEA(0x40,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  : "r"(shuffler)    // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_AVX2
+
+#ifdef HAS_ARGBSHUFFLEROW_SSE2
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  uintptr_t pixel_temp = 0u;
+  asm volatile (
+    "pxor      %%xmm5,%%xmm5                   \n"
+    "mov       " MEMACCESS(4) ",%k2            \n"
+    "cmp       $0x3000102,%k2                  \n"
+    "je        3012f                           \n"
+    "cmp       $0x10203,%k2                    \n"
+    "je        123f                            \n"
+    "cmp       $0x30201,%k2                    \n"
+    "je        321f                            \n"
+    "cmp       $0x2010003,%k2                  \n"
+    "je        2103f                           \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movzb     " MEMACCESS(4) ",%2             \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS(1) "            \n"
+    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
+    BUNDLEALIGN
+    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
+    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
+    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
+    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    "lea       " MEMLEA(0x4,1) ",%1            \n"
+    "sub       $0x1,%3                         \n"
+    "jg        1b                              \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "123:                                        \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        123b                            \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "321:                                        \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        321b                            \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "2103:                                       \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        2103b                           \n"
+    "jmp       99f                             \n"
+
+    LABELALIGN
+  "3012:                                       \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpckhbw %%xmm5,%%xmm1                   \n"
+    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
+    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
+    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
+    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        3012b                           \n"
+
+  "99:                                         \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+d"(pixel_temp),  // %2
+    "+r"(pix)         // %3
+  : "r"(shuffler)      // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBSHUFFLEROW_SSE2
+
+#ifdef HAS_I422TOYUY2ROW_SSE2
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+ asm volatile (
+    "sub       %1,%2                             \n"
+    LABELALIGN
+  "1:                                            \n"
+    "movq      " MEMACCESS(1) ",%%xmm2           \n"
+    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
+    "lea       " MEMLEA(0x8,1) ",%1              \n"
+    "punpcklbw %%xmm3,%%xmm2                     \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
+    "lea       " MEMLEA(0x10,0) ",%0             \n"
+    "movdqa    %%xmm0,%%xmm1                     \n"
+    "punpcklbw %%xmm2,%%xmm0                     \n"
+    "punpckhbw %%xmm2,%%xmm1                     \n"
+    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
+    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
+    "lea       " MEMLEA(0x20,3) ",%3             \n"
+    "sub       $0x10,%4                          \n"
+    "jg         1b                               \n"
+    : "+r"(src_y),  // %0
+      "+r"(src_u),  // %1
+      "+r"(src_v),  // %2
+      "+r"(dst_frame),  // %3
+      "+rm"(width)  // %4
+    :
+    : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+  );
+}
+#endif  // HAS_I422TOYUY2ROW_SSE2
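+
+// Reference sketch (illustrative, unused): YUY2 packs each pair of Y samples
+// with their shared U and V as Y0,U,Y1,V. The UYVY variant below emits
+// U,Y0,V,Y1 instead. Assumes an even width, as the SIMD kernel does.
+static void I422ToYUY2RowSketch_C(const uint8* src_y, const uint8* src_u,
+                                  const uint8* src_v, uint8* dst_frame,
+                                  int width) {
+  for (int x = 0; x < width; x += 2) {
+    dst_frame[0] = src_y[x];
+    dst_frame[1] = src_u[x / 2];
+    dst_frame[2] = src_y[x + 1];
+    dst_frame[3] = src_v[x / 2];
+    dst_frame += 4;
+  }
+}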
+
+#ifdef HAS_I422TOUYVYROW_SSE2
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+ asm volatile (
+    "sub        %1,%2                            \n"
+    LABELALIGN
+  "1:                                            \n"
+    "movq      " MEMACCESS(1) ",%%xmm2           \n"
+    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
+    "lea       " MEMLEA(0x8,1) ",%1              \n"
+    "punpcklbw %%xmm3,%%xmm2                     \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
+    "movdqa    %%xmm2,%%xmm1                     \n"
+    "lea       " MEMLEA(0x10,0) ",%0             \n"
+    "punpcklbw %%xmm0,%%xmm1                     \n"
+    "punpckhbw %%xmm0,%%xmm2                     \n"
+    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
+    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
+    "lea       " MEMLEA(0x20,3) ",%3             \n"
+    "sub       $0x10,%4                          \n"
+    "jg         1b                               \n"
+    : "+r"(src_y),  // %0
+      "+r"(src_u),  // %1
+      "+r"(src_v),  // %2
+      "+r"(dst_frame),  // %3
+      "+rm"(width)  // %4
+    :
+    : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+  );
+}
+#endif  // HAS_I422TOUYVYROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  asm volatile (
+    "pxor      %%xmm3,%%xmm3                   \n"
+
+    // 2 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "punpcklbw %%xmm3,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm4                   \n"
+    "punpcklwd %%xmm3,%%xmm0                   \n"
+    "punpckhwd %%xmm3,%%xmm4                   \n"
+    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "movdqa    %%xmm4,%%xmm5                   \n"
+    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
+    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
+    "addps     " MEMACCESS(3) ",%%xmm0         \n"
+    "addps     " MEMACCESS(3) ",%%xmm4         \n"
+    "movdqa    %%xmm1,%%xmm2                   \n"
+    "movdqa    %%xmm5,%%xmm6                   \n"
+    "mulps     %%xmm1,%%xmm2                   \n"
+    "mulps     %%xmm5,%%xmm6                   \n"
+    "mulps     %%xmm2,%%xmm1                   \n"
+    "mulps     %%xmm6,%%xmm5                   \n"
+    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
+    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
+    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
+    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
+    "addps     %%xmm2,%%xmm0                   \n"
+    "addps     %%xmm6,%%xmm4                   \n"
+    "addps     %%xmm1,%%xmm0                   \n"
+    "addps     %%xmm5,%%xmm4                   \n"
+    "cvttps2dq %%xmm0,%%xmm0                   \n"
+    "cvttps2dq %%xmm4,%%xmm4                   \n"
+    "packuswb  %%xmm4,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "sub       $0x2,%2                         \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(poly)        // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  asm volatile (
+    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
+    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
+    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
+    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
+
+    // 2 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
+    "lea         " MEMLEA(0x8,0) ",%0          \n"
+    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
+    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
+    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
+    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
+    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
+    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
+    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
+    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
+    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
+    "sub         $0x2,%2                       \n"
+    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
+    "lea         " MEMLEA(0x8,1) ",%1          \n"
+    "jg          1b                            \n"
+    "vzeroupper                                \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(poly)        // %3
+  : "memory", "cc"
+#if defined(__SSE2__)
+// TODO(fbarchard): declare ymm usage when applicable.
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
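+
+// Reference sketch (illustrative, unused): both polynomial kernels evaluate,
+// per channel value v taken as a float, C0 + C1*v + C2*v*v + C3*v*v*v and
+// clamp the result back to a byte. poly holds four floats per coefficient,
+// one for each of B,G,R,A, i.e. poly[0..3] = C0, poly[4..7] = C1, and so on.
+static void ARGBPolynomialRowSketch_C(const uint8* src_argb, uint8* dst_argb,
+                                      const float* poly, int width) {
+  for (int i = 0; i < width * 4; ++i) {
+    int c = i & 3;  // Channel index within the B,G,R,A pixel.
+    float v = (float)src_argb[i];
+    float r = poly[c] + poly[4 + c] * v + poly[8 + c] * v * v +
+              poly[12 + c] * v * v * v;
+    dst_argb[i] = (uint8)(r < 0.f ? 0.f : (r > 255.f ? 255.f : r));
+  }
+}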
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+                           int width) {
+  uintptr_t pixel_temp = 0u;
+  asm volatile (
+    // 1 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movzb     " MEMACCESS(0) ",%1             \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
+    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
+    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
+    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
+    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
+    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
+    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
+    "dec       %2                              \n"
+    "jg        1b                              \n"
+  : "+r"(dst_argb),   // %0
+    "+d"(pixel_temp), // %1
+    "+r"(width)       // %2
+  : "r"(table_argb)   // %3
+  : "memory", "cc");
+}
+#endif  // HAS_ARGBCOLORTABLEROW_X86
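+
+// Reference sketch (illustrative, unused): each channel is replaced through a
+// per-channel 256-entry table, indexed as table_argb[value * 4 + channel].
+// The RGB variant below does the same for B, G and R and leaves alpha alone.
+static void ARGBColorTableRowSketch_C(uint8* dst_argb, const uint8* table_argb,
+                                      int width) {
+  for (int x = 0; x < width; ++x) {
+    dst_argb[x * 4 + 0] = table_argb[dst_argb[x * 4 + 0] * 4 + 0];
+    dst_argb[x * 4 + 1] = table_argb[dst_argb[x * 4 + 1] * 4 + 1];
+    dst_argb[x * 4 + 2] = table_argb[dst_argb[x * 4 + 2] * 4 + 2];
+    dst_argb[x * 4 + 3] = table_argb[dst_argb[x * 4 + 3] * 4 + 3];
+  }
+}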
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+  uintptr_t pixel_temp = 0u;
+  asm volatile (
+    // 1 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movzb     " MEMACCESS(0) ",%1             \n"
+    "lea       " MEMLEA(0x4,0) ",%0            \n"
+    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
+    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
+    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
+    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
+    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
+    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
+    "dec       %2                              \n"
+    "jg        1b                              \n"
+  : "+r"(dst_argb),   // %0
+    "+d"(pixel_temp), // %1
+    "+r"(width)       // %2
+  : "r"(table_argb)   // %3
+  : "memory", "cc");
+}
+#endif  // HAS_RGBCOLORTABLEROW_X86
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff) {
+  uintptr_t pixel_temp = 0u;
+  uintptr_t table_temp = 0u;
+  asm volatile (
+    "movd      %6,%%xmm3                       \n"
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+    "pcmpeqb   %%xmm4,%%xmm4                   \n"
+    "psllw     $0x8,%%xmm4                     \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+
+    // 4 pixel loop.
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
+    "pmaddubsw %%xmm3,%%xmm0                   \n"
+    "phaddw    %%xmm0,%%xmm0                   \n"
+    "pand      %%xmm4,%%xmm0                   \n"
+    "punpcklwd %%xmm5,%%xmm0                   \n"
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+
+    "movzb     " MEMACCESS(2) ",%0             \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS(3) "            \n"
+    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
+    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
+    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0x3,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+
+    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
+    BUNDLEALIGN
+    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
+    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
+    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0x7,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+
+    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
+    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
+    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
+    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0xb,3) "       \n"
+
+    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+    "add       %5,%1                           \n"
+
+    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
+    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
+    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
+    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
+    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
+    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
+    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
+    "sub       $0x4,%4                         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "lea       " MEMLEA(0x10,3) ",%3           \n"
+    "jg        1b                              \n"
+  : "+d"(pixel_temp),  // %0
+    "+a"(table_temp),  // %1
+    "+r"(src_argb),    // %2
+    "+r"(dst_argb),    // %3
+    "+rm"(width)       // %4
+  : "r"(luma),         // %5
+    "rm"(lumacoeff)    // %6
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
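+
+// Reference sketch (illustrative, unused; a hedged reading of the kernel
+// above): a weighted sum of the pixel's bytes with the four signed bytes of
+// lumacoeff selects a 256-byte segment of the luma table (the pand keeps only
+// bits 8..15); B, G and R are looked up in that segment and A is copied.
+// Ignores the 16-bit saturation of pmaddubsw/phaddw and assumes little-endian
+// byte order for lumacoeff, which holds on x86.
+static void ARGBLumaColorTableRowSketch_C(const uint8* src_argb,
+                                          uint8* dst_argb, int width,
+                                          const uint8* luma,
+                                          uint32 lumacoeff) {
+  const signed char* c = (const signed char*)&lumacoeff;
+  for (int x = 0; x < width; ++x) {
+    int w = src_argb[0] * c[0] + src_argb[1] * c[1] +
+            src_argb[2] * c[2] + src_argb[3] * c[3];
+    const uint8* t = luma + (w & 0xff00);  // 256-byte aligned segment.
+    dst_argb[0] = t[src_argb[0]];
+    dst_argb[1] = t[src_argb[1]];
+    dst_argb[2] = t[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}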
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/row_win.cc
@@ -1,0 +1,7284 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constants for ARGB.
+static const vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+// JPEG full range.
+static const vec8 kARGBToYJ = {
+  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
+};
+
+static const vec8 kARGBToU = {
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
+static const vec8 kARGBToUJ = {
+  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
+};
+
+static const vec8 kARGBToV = {
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+};
+
+static const vec8 kARGBToVJ = {
+  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
+};
+
+// vpermd permutation for reordering the result of vphaddw + vpackuswb.
+static const lvec32 kPermdARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7
+};
+
+// vpshufb for vphaddw + vpackuswb packed to shorts.
+static const lvec8 kShufARGBToUV_AVX = {
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+};
+
+// Constants for BGRA.
+static const vec8 kBGRAToY = {
+  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
+};
+
+static const vec8 kBGRAToU = {
+  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
+};
+
+static const vec8 kBGRAToV = {
+  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
+};
+
+// Constants for ABGR.
+static const vec8 kABGRToY = {
+  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
+};
+
+static const vec8 kABGRToU = {
+  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
+};
+
+static const vec8 kABGRToV = {
+  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
+};
+
+// Constants for RGBA.
+static const vec8 kRGBAToY = {
+  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
+};
+
+static const vec8 kRGBAToU = {
+  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
+};
+
+static const vec8 kRGBAToV = {
+  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
+};
+
+static const uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
+static const vec16 kAddYJ64 = {
+  64, 64, 64, 64, 64, 64, 64, 64
+};
+
+static const uvec8 kAddUV128 = {
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
+static const uvec16 kAddUVJ128 = {
+  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
+};
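+
+// Reference sketch (illustrative, unused): one plausible scalar reading of
+// kARGBToY and kAddY16 above, i.e. BT.601 studio-swing weights scaled by 128
+// applied to a pixel stored as B,G,R,A. The U/V tables and the *J (full-range
+// JPEG) variants follow the same pattern with their own weights and bias.
+static uint8 ARGBToYSketch_C(uint8 b, uint8 g, uint8 r) {
+  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
+}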
+
+// Shuffle table for converting RGB24 to ARGB.
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
+};
+
+// Shuffle table for converting RAW to ARGB.
+static const uvec8 kShuffleMaskRAWToARGB = {
+  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
+};
+
+// Shuffle table for converting ARGB to RGB24.
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGB to RAW.
+static const uvec8 kShuffleMaskARGBToRAW = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
+};
+
+// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
+};
+
+// Shuffle table for converting ARGBToRAW.  First 8 + next 4
+static const uvec8 kShuffleMaskARGBToRAW_0 = {
+  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
+};
+
+// Duplicates gray value 3 times and fills in alpha opaque.
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
+  __asm {
+    mov        eax, [esp + 4]        // src_y
+    mov        edx, [esp + 8]        // dst_argb
+    mov        ecx, [esp + 12]       // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
+    pslld      xmm5, 24
+
+    align      4
+  convertloop:
+    movq       xmm0, qword ptr [eax]
+    lea        eax,  [eax + 8]
+    punpcklbw  xmm0, xmm0
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm0
+    punpckhwd  xmm1, xmm1
+    por        xmm0, xmm5
+    por        xmm1, xmm5
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
+                                  int pix) {
+  __asm {
+    mov        eax, [esp + 4]        // src_y
+    mov        edx, [esp + 8]        // dst_argb
+    mov        ecx, [esp + 12]       // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
+    pslld      xmm5, 24
+
+    align      4
+  convertloop:
+    movq       xmm0, qword ptr [eax]
+    lea        eax,  [eax + 8]
+    punpcklbw  xmm0, xmm0
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm0
+    punpckhwd  xmm1, xmm1
+    por        xmm0, xmm5
+    por        xmm1, xmm5
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+    ret
+  }
+}
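+
+// Reference sketch (illustrative, unused): scalar form of the gray expansion
+// done by the two I400ToARGB kernels above.
+static void I400ToARGBRowSketch_C(const uint8* src_y, uint8* dst_argb,
+                                  int pix) {
+  for (int x = 0; x < pix; ++x) {
+    dst_argb[0] = dst_argb[1] = dst_argb[2] = src_y[x];  // B = G = R = Y.
+    dst_argb[3] = 255u;                                  // Opaque alpha.
+    dst_argb += 4;
+  }
+}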
+
+__declspec(naked) __declspec(align(16))
+void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_rgb24
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pslld     xmm5, 24
+    movdqa    xmm4, kShuffleMaskRGB24ToARGB
+
+    align      4
+ convertloop:
+    movdqu    xmm0, [eax]
+    movdqu    xmm1, [eax + 16]
+    movdqu    xmm3, [eax + 32]
+    lea       eax, [eax + 48]
+    movdqa    xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    pshufb    xmm2, xmm4
+    por       xmm2, xmm5
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    pshufb    xmm0, xmm4
+    movdqa    [edx + 32], xmm2
+    por       xmm0, xmm5
+    pshufb    xmm1, xmm4
+    movdqa    [edx], xmm0
+    por       xmm1, xmm5
+    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb    xmm3, xmm4
+    movdqa    [edx + 16], xmm1
+    por       xmm3, xmm5
+    sub       ecx, 16
+    movdqa    [edx + 48], xmm3
+    lea       edx, [edx + 64]
+    jg        convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
+                        int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_raw
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pslld     xmm5, 24
+    movdqa    xmm4, kShuffleMaskRAWToARGB
+
+    align      4
+ convertloop:
+    movdqu    xmm0, [eax]
+    movdqu    xmm1, [eax + 16]
+    movdqu    xmm3, [eax + 32]
+    lea       eax, [eax + 48]
+    movdqa    xmm2, xmm3
+    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    pshufb    xmm2, xmm4
+    por       xmm2, xmm5
+    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    pshufb    xmm0, xmm4
+    movdqa    [edx + 32], xmm2
+    por       xmm0, xmm5
+    pshufb    xmm1, xmm4
+    movdqa    [edx], xmm0
+    por       xmm1, xmm5
+    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    pshufb    xmm3, xmm4
+    movdqa    [edx + 16], xmm1
+    por       xmm3, xmm5
+    sub       ecx, 16
+    movdqa    [edx + 48], xmm3
+    lea       edx, [edx + 64]
+    jg        convertloop
+    ret
+  }
+}
+
+// pmul method to replicate bits.
+// Math to replicate bits:
+// (v << 8) | (v << 3)
+// v * 256 + v * 8
+// v * (256 + 8)
+// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 2 (multiplier 0x2080).
+// 20 instructions.
+__declspec(naked) __declspec(align(16))
+void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+                          int pix) {
+  __asm {
+    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd      xmm5, eax
+    pshufd    xmm5, xmm5, 0
+    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
+    movd      xmm6, eax
+    pshufd    xmm6, xmm6, 0
+    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw     xmm3, 11
+    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
+    psllw     xmm4, 10
+    psrlw     xmm4, 5
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm7, 8
+
+    mov       eax, [esp + 4]   // src_rgb565
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    sub       edx, eax
+    sub       edx, eax
+
+    align      4
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
+    movdqa    xmm1, xmm0
+    movdqa    xmm2, xmm0
+    pand      xmm1, xmm3    // R in upper 5 bits
+    psllw     xmm2, 11      // B in upper 5 bits
+    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    psllw     xmm1, 8
+    por       xmm1, xmm2    // RB
+    pand      xmm0, xmm4    // G in middle 6 bits
+    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
+    por       xmm0, xmm7    // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0
+    punpckhbw xmm2, xmm0
+    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
+    sub       ecx, 8
+    jg        convertloop
+    ret
+  }
+}
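+
+// Reference sketch (illustrative, unused): scalar equivalent of the pmul
+// bit replication used above. A 5-bit field widens to 8 bits as
+// (v << 3) | (v >> 2), which is what multiplying by 0x108 and keeping the
+// high byte computes; a 6-bit field becomes (v << 2) | (v >> 4).
+static void RGB565ToARGBPixelSketch_C(uint16 rgb565, uint8* dst_argb) {
+  uint8 b = (uint8)(rgb565 & 0x1f);
+  uint8 g = (uint8)((rgb565 >> 5) & 0x3f);
+  uint8 r = (uint8)((rgb565 >> 11) & 0x1f);
+  dst_argb[0] = (uint8)((b << 3) | (b >> 2));
+  dst_argb[1] = (uint8)((g << 2) | (g >> 4));
+  dst_argb[2] = (uint8)((r << 3) | (r >> 2));
+  dst_argb[3] = 255u;
+}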
+
+// 24 instructions
+__declspec(naked) __declspec(align(16))
+void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) {
+  __asm {
+    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
+    movd      xmm5, eax
+    pshufd    xmm5, xmm5, 0
+    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
+    movd      xmm6, eax
+    pshufd    xmm6, xmm6, 0
+    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    psllw     xmm3, 11
+    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
+    psrlw     xmm4, 6
+    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    psllw     xmm7, 8
+
+    mov       eax, [esp + 4]   // src_argb1555
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    sub       edx, eax
+    sub       edx, eax
+
+    align      4
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
+    movdqa    xmm1, xmm0
+    movdqa    xmm2, xmm0
+    psllw     xmm1, 1       // R in upper 5 bits
+    psllw     xmm2, 11      // B in upper 5 bits
+    pand      xmm1, xmm3
+    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    psllw     xmm1, 8
+    por       xmm1, xmm2    // RB
+    movdqa    xmm2, xmm0
+    pand      xmm0, xmm4    // G in middle 5 bits
+    psraw     xmm2, 8       // A
+    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
+    pand      xmm2, xmm7
+    por       xmm0, xmm2    // AG
+    movdqa    xmm2, xmm1
+    punpcklbw xmm1, xmm0
+    punpckhbw xmm2, xmm0
+    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
+    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
+    sub       ecx, 8
+    jg        convertloop
+    ret
+  }
+}
+
+// 18 instructions.
+__declspec(naked) __declspec(align(16))
+void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix) {
+  __asm {
+    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
+    movd      xmm4, eax
+    pshufd    xmm4, xmm4, 0
+    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
+    pslld     xmm5, 4
+    mov       eax, [esp + 4]   // src_argb4444
+    mov       edx, [esp + 8]   // dst_argb
+    mov       ecx, [esp + 12]  // pix
+    sub       edx, eax
+    sub       edx, eax
+
+    align      4
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
+    movdqa    xmm2, xmm0
+    pand      xmm0, xmm4    // mask low nibbles
+    pand      xmm2, xmm5    // mask high nibbles
+    movdqa    xmm1, xmm0
+    movdqa    xmm3, xmm2
+    psllw     xmm1, 4
+    psrlw     xmm3, 4
+    por       xmm0, xmm1
+    por       xmm2, xmm3
+    movdqa    xmm1, xmm0
+    punpcklbw xmm0, xmm2
+    punpckhbw xmm1, xmm2
+    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
+    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
+    lea       eax, [eax + 16]
+    sub       ecx, 8
+    jg        convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    movdqa    xmm6, kShuffleMaskARGBToRGB24
+
+    align      4
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm1, [eax + 16]
+    movdqu    xmm2, [eax + 32]
+    movdqu    xmm3, [eax + 48]
+    lea       eax, [eax + 64]
+    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm1, xmm6
+    pshufb    xmm2, xmm6
+    pshufb    xmm3, xmm6
+    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
+    psrldq    xmm1, 4      // 8 bytes from 1
+    pslldq    xmm4, 12     // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
+    por       xmm0, xmm4   // 4 bytes from 1 for 0
+    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqu    [edx], xmm0  // store 0
+    por       xmm1, xmm5   // 8 bytes from 2 for 1
+    psrldq    xmm2, 8      // 4 bytes from 2
+    pslldq    xmm3, 4      // 12 bytes from 3 for 2
+    por       xmm2, xmm3   // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1   // store 1
+    movdqu    [edx + 32], xmm2   // store 2
+    lea       edx, [edx + 48]
+    sub       ecx, 16
+    jg        convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    movdqa    xmm6, kShuffleMaskARGBToRAW
+
+    align      4
+ convertloop:
+    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm1, [eax + 16]
+    movdqu    xmm2, [eax + 32]
+    movdqu    xmm3, [eax + 48]
+    lea       eax, [eax + 64]
+    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm1, xmm6
+    pshufb    xmm2, xmm6
+    pshufb    xmm3, xmm6
+    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
+    psrldq    xmm1, 4      // 8 bytes from 1
+    pslldq    xmm4, 12     // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
+    por       xmm0, xmm4   // 4 bytes from 1 for 0
+    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqu    [edx], xmm0  // store 0
+    por       xmm1, xmm5   // 8 bytes from 2 for 1
+    psrldq    xmm2, 8      // 4 bytes from 2
+    pslldq    xmm3, 4      // 12 bytes from 3 for 2
+    por       xmm2, xmm3   // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1   // store 1
+    movdqu    [edx + 32], xmm2   // store 2
+    lea       edx, [edx + 48]
+    sub       ecx, 16
+    jg        convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    psrld     xmm3, 27
+    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    psrld     xmm4, 26
+    pslld     xmm4, 5
+    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pslld     xmm5, 11
+
+    align      4
+ convertloop:
+    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    pslld     xmm0, 8       // R
+    psrld     xmm1, 3       // B
+    psrld     xmm2, 5       // G
+    psrad     xmm0, 16      // R
+    pand      xmm1, xmm3    // B
+    pand      xmm2, xmm4    // G
+    pand      xmm0, xmm5    // R
+    por       xmm1, xmm2    // BG
+    por       xmm0, xmm1    // BGR
+    packssdw  xmm0, xmm0
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+
+// TODO(fbarchard): Improve sign extension/packing.
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
+    psrld     xmm4, 27
+    movdqa    xmm5, xmm4       // generate mask 0x000003e0
+    pslld     xmm5, 5
+    movdqa    xmm6, xmm4       // generate mask 0x00007c00
+    pslld     xmm6, 10
+    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
+    pslld     xmm7, 15
+
+    align      4
+ convertloop:
+    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0    // B
+    movdqa    xmm2, xmm0    // G
+    movdqa    xmm3, xmm0    // R
+    psrad     xmm0, 16      // A
+    psrld     xmm1, 3       // B
+    psrld     xmm2, 6       // G
+    psrld     xmm3, 9       // R
+    pand      xmm0, xmm7    // A
+    pand      xmm1, xmm4    // B
+    pand      xmm2, xmm5    // G
+    pand      xmm3, xmm6    // R
+    por       xmm0, xmm1    // BA
+    por       xmm2, xmm3    // GR
+    por       xmm0, xmm2    // BGRA
+    packssdw  xmm0, xmm0
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
+  __asm {
+    mov       eax, [esp + 4]   // src_argb
+    mov       edx, [esp + 8]   // dst_rgb
+    mov       ecx, [esp + 12]  // pix
+    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
+    psllw     xmm4, 12
+    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
+    psrlw     xmm3, 8
+
+    align      4
+ convertloop:
+    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0
+    pand      xmm0, xmm3    // low nibble
+    pand      xmm1, xmm4    // high nibble
+    psrld     xmm0, 4
+    psrld     xmm1, 8
+    por       xmm0, xmm1
+    packuswb  xmm0, xmm0
+    lea       eax, [eax + 16]
+    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
+    lea       edx, [edx + 8]
+    sub       ecx, 4
+    jg        convertloop
+    ret
+  }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
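+// Weights B,G,R of each pixel with kARGBToY (pmaddubsw), sums the pairs with
+// phaddw, scales down by 7 bits and adds the kAddY16 offset.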
+__declspec(naked) __declspec(align(16))
+void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kARGBToY
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
+__declspec(naked) __declspec(align(16))
+void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm4, kARGBToYJ
+    movdqa     xmm5, kAddYJ64
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    paddw      xmm0, xmm5  // Add .5 for rounding.
+    paddw      xmm2, xmm5
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked) __declspec(align(32))
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    vbroadcastf128 ymm4, kARGBToY
+    vbroadcastf128 ymm5, kAddY16
+    vmovdqa    ymm6, kPermdARGBToY_AVX
+
+    align      4
+ convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1  // mutates.
+    vphaddw    ymm2, ymm2, ymm3
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2  // mutates.
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vpaddb     ymm0, ymm0, ymm5
+    sub        ecx, 32
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  //  HAS_ARGBTOYROW_AVX2
+
+#ifdef HAS_ARGBTOYROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked) __declspec(align(32))
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    vbroadcastf128 ymm4, kARGBToYJ
+    vbroadcastf128 ymm5, kAddYJ64
+    vmovdqa    ymm6, kPermdARGBToY_AVX
+
+    align      4
+ convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1  // mutates.
+    vphaddw    ymm2, ymm2, ymm3
+    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
+    vpaddw     ymm2, ymm2, ymm5
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2  // mutates.
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    sub        ecx, 32
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  //  HAS_ARGBTOYROW_AVX2
+
+__declspec(naked) __declspec(align(16))
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kARGBToY
+
+    align      4
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm4, kARGBToYJ
+    movdqa     xmm5, kAddYJ64
+
+    align      4
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    paddw      xmm0, xmm5
+    paddw      xmm2, xmm5
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kBGRAToY
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kBGRAToY
+
+    align      4
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kABGRToY
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kABGRToY
+
+    align      4
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kRGBAToY
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    movdqa     xmm5, kAddY16
+    movdqa     xmm4, kRGBAToY
+
+    align      4
+ convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm2, xmm4
+    pmaddubsw  xmm3, xmm4
+    lea        eax, [eax + 64]
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psrlw      xmm0, 7
+    psrlw      xmm2, 7
+    packuswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax,  [eax + 64]
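+    // shufps 0x88/0xdd gathers even/odd pixels; pavgb then averages the
+    // horizontal pairs, completing the 2x2 box filter begun by the row
+    // average above.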
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                        uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kARGBToUJ
+    movdqa     xmm6, kARGBToVJ
+    movdqa     xmm5, kAddUVJ128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
+    paddw      xmm1, xmm5
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked) __declspec(align(32))
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    vbroadcastf128 ymm5, kAddUV128
+    vbroadcastf128 ymm6, kARGBToV
+    vbroadcastf128 ymm7, kARGBToU
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax,  [eax + 128]
+    vshufps    ymm4, ymm0, ymm1, 0x88
+    vshufps    ymm0, ymm0, ymm1, 0xdd
+    vpavgb     ymm0, ymm0, ymm4  // mutated by vshufps
+    vshufps    ymm4, ymm2, ymm3, 0x88
+    vshufps    ymm2, ymm2, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
+    vpmaddubsw ymm1, ymm0, ymm7  // U
+    vpmaddubsw ymm3, ymm2, ymm7
+    vpmaddubsw ymm0, ymm0, ymm6  // V
+    vpmaddubsw ymm2, ymm2, ymm6
+    vphaddw    ymm1, ymm1, ymm3  // mutates
+    vphaddw    ymm0, ymm0, ymm2
+    vpsraw     ymm1, ymm1, 8
+    vpsraw     ymm0, ymm0, 8
+    vpacksswb  ymm0, ymm1, ymm0  // mutates
+    vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
+    vpshufb    ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
+    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
+
+    // step 3 - store 16 U and 16 V values
+    sub         ecx, 32
+    vextractf128 [edx], ymm0, 0 // U
+    vextractf128 [edx + edi], ymm0, 1 // V
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kARGBToUJ
+    movdqa     xmm6, kARGBToVJ
+    movdqa     xmm5, kAddUVJ128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    paddw      xmm0, xmm5            // +.5 rounding -> unsigned
+    paddw      xmm1, xmm5
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* convert to U and V */
+    movdqa     xmm0, [eax]          // U
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm7
+    pmaddubsw  xmm1, xmm7
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm3, xmm7
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx,  16
+    movdqa     [edx], xmm0
+
+    movdqa     xmm0, [eax]          // V
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm6
+    pmaddubsw  xmm1, xmm6
+    pmaddubsw  xmm2, xmm6
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    lea        eax,  [eax + 64]
+    movdqa     [edx + edi], xmm0
+    lea        edx,  [edx + 16]
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
+                                    uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* convert to U and V */
+    movdqu     xmm0, [eax]          // U
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm7
+    pmaddubsw  xmm1, xmm7
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm3, xmm7
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    sub        ecx,  16
+    movdqu     [edx], xmm0
+
+    movdqu     xmm0, [eax]          // V
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    pmaddubsw  xmm0, xmm6
+    pmaddubsw  xmm1, xmm6
+    pmaddubsw  xmm2, xmm6
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm1
+    phaddw     xmm2, xmm3
+    psraw      xmm0, 8
+    psraw      xmm2, 8
+    packsswb   xmm0, xmm2
+    paddb      xmm0, xmm5
+    lea        eax,  [eax + 64]
+    movdqu     [edx + edi], xmm0
+    lea        edx,  [edx + 16]
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
+                          uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
+                                    uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]   // src_argb
+    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // pix
+    movdqa     xmm7, kARGBToU
+    movdqa     xmm6, kARGBToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kBGRAToU
+    movdqa     xmm6, kBGRAToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kBGRAToU
+    movdqa     xmm6, kBGRAToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kABGRToU
+    movdqa     xmm6, kABGRToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kABGRToU
+    movdqa     xmm6, kABGRToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                       uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kRGBAToU
+    movdqa     xmm6, kRGBAToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+    pavgb      xmm0, [eax + esi]
+    pavgb      xmm1, [eax + esi + 16]
+    pavgb      xmm2, [eax + esi + 32]
+    pavgb      xmm3, [eax + esi + 48]
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    movdqa     xmm7, kRGBAToU
+    movdqa     xmm6, kRGBAToV
+    movdqa     xmm5, kAddUV128
+    sub        edi, edx             // stride from u to v
+
+    align      4
+ convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + 32]
+    movdqu     xmm3, [eax + 48]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+    lea        eax,  [eax + 64]
+    movdqa     xmm4, xmm0
+    shufps     xmm0, xmm1, 0x88
+    shufps     xmm4, xmm1, 0xdd
+    pavgb      xmm0, xmm4
+    movdqa     xmm4, xmm2
+    shufps     xmm2, xmm3, 0x88
+    shufps     xmm4, xmm3, 0xdd
+    pavgb      xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    pmaddubsw  xmm0, xmm7  // U
+    pmaddubsw  xmm2, xmm7
+    pmaddubsw  xmm1, xmm6  // V
+    pmaddubsw  xmm3, xmm6
+    phaddw     xmm0, xmm2
+    phaddw     xmm1, xmm3
+    psraw      xmm0, 8
+    psraw      xmm1, 8
+    packsswb   xmm0, xmm1
+    paddb      xmm0, xmm5            // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    sub        ecx, 16
+    movlps     qword ptr [edx], xmm0 // U
+    movhps     qword ptr [edx + edi], xmm0 // V
+    lea        edx, [edx + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBTOYROW_SSSE3
+
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+
+#define UB 127 /* min(63,(int8)(2.018 * 64)) */
+#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
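+
+// With these 6-bit fixed-point constants, e.g. blue is effectively
+//   B = clamp8(((Y - 16) * YG + (U - 128) * UB + (V - 128) * VB) >> 6)
+// (G uses UG/VG, R uses UR/VR); the BB/BG/BR biases fold the -128 U/V
+// offsets into the unsigned pmaddubsw products.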
+
+#ifdef HAS_I422TOARGBROW_AVX2
+
+static const lvec8 kUVToB_AVX = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+static const lvec8 kUVToR_AVX = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+static const lvec8 kUVToG_AVX = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+static const lvec16 kYToRgb_AVX = {
+  YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
+};
+static const lvec16 kYSub16_AVX = {
+  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+};
+static const lvec16 kUVBiasB_AVX = {
+  BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
+};
+static const lvec16 kUVBiasG_AVX = {
+  BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
+};
+static const lvec16 kUVBiasR_AVX = {
+  BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
+};
+
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpxor      ymm4, ymm4, ymm4
+
+    align      4
+ convertloop:
+    vmovq      xmm0, qword ptr [esi]          //  U
+    vmovq      xmm1, qword ptr [esi + edi]    //  V
+    lea        esi,  [esi + 8]
+    vpunpcklbw ymm0, ymm0, ymm1               // UV
+    vpermq     ymm0, ymm0, 0xd8
+    vpunpcklwd ymm0, ymm0, ymm0              // UVUV
+    vpmaddubsw ymm2, ymm0, kUVToB_AVX        // scale B UV
+    vpmaddubsw ymm1, ymm0, kUVToG_AVX        // scale G UV
+    vpmaddubsw ymm0, ymm0, kUVToR_AVX        // scale R UV
+    vpsubw     ymm2, ymm2, kUVBiasB_AVX      // unbias back to signed
+    vpsubw     ymm1, ymm1, kUVBiasG_AVX
+    vpsubw     ymm0, ymm0, kUVBiasR_AVX
+
+    // Step 2: Find Y contribution to 16 R,G,B values
+    vmovdqu    xmm3, [eax]                  // NOLINT
+    lea        eax, [eax + 16]
+    vpermq     ymm3, ymm3, 0xd8
+    vpunpcklbw ymm3, ymm3, ymm4
+    vpsubsw    ymm3, ymm3, kYSub16_AVX
+    vpmullw    ymm3, ymm3, kYToRgb_AVX
+    vpaddsw    ymm2, ymm2, ymm3           // B += Y
+    vpaddsw    ymm1, ymm1, ymm3           // G += Y
+    vpaddsw    ymm0, ymm0, ymm3           // R += Y
+    vpsraw     ymm2, ymm2, 6
+    vpsraw     ymm1, ymm1, 6
+    vpsraw     ymm0, ymm0, 6
+    vpackuswb  ymm2, ymm2, ymm2           // B
+    vpackuswb  ymm1, ymm1, ymm1           // G
+    vpackuswb  ymm0, ymm0, ymm0           // R
+
+    // Step 3: Weave into ARGB
+    vpunpcklbw ymm2, ymm2, ymm1           // BG
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpcklbw ymm0, ymm0, ymm5           // RA
+    vpermq     ymm0, ymm0, 0xd8
+    vpunpcklwd ymm1, ymm2, ymm0           // BGRA first 8 pixels
+    vpunpckhwd ymm2, ymm2, ymm0           // BGRA next 8 pixels
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx,  [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+    vzeroupper
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_I422TOARGBROW_AVX2
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+
+static const vec8 kUVToB = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {
+  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
+// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
+
+// Read 8 UV from 444.
+#define READYUV444 __asm {                                                     \
+    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
+    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+  }
+
+// Read 4 UV from 422, upsample to 8 UV.
+#define READYUV422 __asm {                                                     \
+    __asm movd       xmm0, [esi]          /* U */                              \
+    __asm movd       xmm1, [esi + edi]    /* V */                              \
+    __asm lea        esi,  [esi + 4]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+  }
+
+// Read 2 UV from 411, upsample to 8 UV.
+#define READYUV411 __asm {                                                     \
+    __asm movzx      ebx, word ptr [esi]        /* U */           /* NOLINT */ \
+    __asm movd       xmm0, ebx                                                 \
+    __asm movzx      ebx, word ptr [esi + edi]  /* V */           /* NOLINT */ \
+    __asm movd       xmm1, ebx                                                 \
+    __asm lea        esi,  [esi + 2]                                           \
+    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
+  }
+
+// Read 4 UV from NV12, upsample to 8 UV.
+#define READNV12 __asm {                                                       \
+    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
+    __asm lea        esi,  [esi + 8]                                           \
+    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+  }
+
+// Convert 8 pixels: 8 UV and 8 Y.
+#define YUVTORGB __asm {                                                       \
+    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm movdqa     xmm2, xmm0                                                \
+    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
+    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
+    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
+    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
+    __asm psubw      xmm1, kUVBiasG                                            \
+    __asm psubw      xmm2, kUVBiasR                                            \
+    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
+    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
+    __asm lea        eax, [eax + 8]                                            \
+    __asm punpcklbw  xmm3, xmm4                                                \
+    __asm psubsw     xmm3, kYSub16                                             \
+    __asm pmullw     xmm3, kYToRgb                                             \
+    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
+    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
+    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
+    __asm psraw      xmm0, 6                                                   \
+    __asm psraw      xmm1, 6                                                   \
+    __asm psraw      xmm2, 6                                                   \
+    __asm packuswb   xmm0, xmm0           /* B */                              \
+    __asm packuswb   xmm1, xmm1           /* G */                              \
+    __asm packuswb   xmm2, xmm2           /* R */                              \
+  }
+
+// Convert 8 pixels: 8 VU and 8 Y.
+#define YVUTORGB __asm {                                                       \
+    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
+    __asm movdqa     xmm1, xmm0                                                \
+    __asm movdqa     xmm2, xmm0                                                \
+    __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
+    __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
+    __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
+    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
+    __asm psubw      xmm1, kUVBiasG                                            \
+    __asm psubw      xmm2, kUVBiasR                                            \
+    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
+    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
+    __asm lea        eax, [eax + 8]                                            \
+    __asm punpcklbw  xmm3, xmm4                                                \
+    __asm psubsw     xmm3, kYSub16                                             \
+    __asm pmullw     xmm3, kYToRgb                                             \
+    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
+    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
+    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
+    __asm psraw      xmm0, 6                                                   \
+    __asm psraw      xmm1, 6                                                   \
+    __asm psraw      xmm2, 6                                                   \
+    __asm packuswb   xmm0, xmm0           /* B */                              \
+    __asm packuswb   xmm1, xmm1           /* G */                              \
+    __asm packuswb   xmm2, xmm2           /* R */                              \
+  }
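+
+// YUVTORGB/YVUTORGB above perform a 6-bit fixed-point conversion.  As an
+// illustrative scalar sketch (not part of the build), each channel is roughly
+//   y1 = (y - 16) * 74;                     // 74/64 ~= 1.164, BT.601 range
+//   b  = clamp8((uv_to_b(u, v) + y1) >> 6);
+//   g  = clamp8((uv_to_g(u, v) + y1) >> 6);
+//   r  = clamp8((uv_to_r(u, v) + y1) >> 6);
+// where uv_to_*() stand for the pmaddubsw products against the kUVTo*
+// constants (with the kUVBias* offsets removed) and clamp8() saturates to
+// [0, 255].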
+
+// 8 pixels, dest aligned 16.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I444ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV444
+    YUVTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
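+
+// In the ARGB writers, Step 3 interleaves the packed B, G and R bytes with the
+// 0xff alpha in xmm5, so each pixel is stored as the byte sequence B,G,R,A --
+// a little-endian ARGB dword.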
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 24 bytes of RGB24.
+__declspec(naked) __declspec(align(16))
+void I422ToRGB24Row_SSSE3(const uint8* y_buf,
+                          const uint8* u_buf,
+                          const uint8* v_buf,
+                          uint8* dst_rgb24,
+                          int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // rgb24
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pxor       xmm4, xmm4
+    movdqa     xmm5, kShuffleMaskARGBToRGB24_0
+    movdqa     xmm6, kShuffleMaskARGBToRGB24
+
+    align      4
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into RRGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm2           // RR
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
+    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
+    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
+    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
+    movq       qword ptr [edx], xmm0  // First 8 bytes
+    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
+    lea        edx,  [edx + 24]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 24 bytes of RAW.
+__declspec(naked) __declspec(align(16))
+void I422ToRAWRow_SSSE3(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_raw,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // raw
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pxor       xmm4, xmm4
+    movdqa     xmm5, kShuffleMaskARGBToRAW_0
+    movdqa     xmm6, kShuffleMaskARGBToRAW
+
+    align      4
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into RRGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm2           // RR
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
+    pshufb     xmm0, xmm5           // Pack into first 8 and last 4 bytes.
+    pshufb     xmm1, xmm6           // Pack into first 12 bytes.
+    palignr    xmm1, xmm0, 12       // last 4 bytes of xmm0 + 12 from xmm1
+    movq       qword ptr [edx], xmm0  // First 8 bytes
+    movdqu     [edx + 8], xmm1      // Last 16 bytes. = 24 bytes, 8 RGB pixels.
+    lea        edx,  [edx + 24]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
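+
+// For the two 24-bpp writers above, the pshufb masks (kShuffleMaskARGBToRGB24*
+// and kShuffleMaskARGBToRAW*, defined earlier in this file) drop the duplicated
+// fourth byte of each BGRR dword, and palignr splices the shuffled halves so
+// that 8 pixels come out as 24 contiguous bytes.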
+
+// 8 pixels, dest unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 pixels (16 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToRGB565Row_SSSE3(const uint8* y_buf,
+                           const uint8* u_buf,
+                           const uint8* v_buf,
+                           uint8* rgb565_buf,
+                           int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // rgb565
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pxor       xmm4, xmm4
+    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
+    psrld      xmm5, 27
+    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
+    psrld      xmm6, 26
+    pslld      xmm6, 5
+    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
+    pslld      xmm7, 11
+
+    align      4
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into RRGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm2           // RR
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRR first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRR next 4 pixels
+
+    // Step 3b: RRGB -> RGB565
+    movdqa     xmm3, xmm0    // B  first 4 pixels of argb
+    movdqa     xmm2, xmm0    // G
+    pslld      xmm0, 8       // R
+    psrld      xmm3, 3       // B
+    psrld      xmm2, 5       // G
+    psrad      xmm0, 16      // R
+    pand       xmm3, xmm5    // B
+    pand       xmm2, xmm6    // G
+    pand       xmm0, xmm7    // R
+    por        xmm3, xmm2    // BG
+    por        xmm0, xmm3    // BGR
+    movdqa     xmm3, xmm1    // B  next 4 pixels of argb
+    movdqa     xmm2, xmm1    // G
+    pslld      xmm1, 8       // R
+    psrld      xmm3, 3       // B
+    psrld      xmm2, 5       // G
+    psrad      xmm1, 16      // R
+    pand       xmm3, xmm5    // B
+    pand       xmm2, xmm6    // G
+    pand       xmm1, xmm7    // R
+    por        xmm3, xmm2    // BG
+    por        xmm1, xmm3    // BGR
+    packssdw   xmm0, xmm1
+    sub        ecx, 8
+    movdqu     [edx], xmm0   // store 8 pixels of RGB565
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
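+
+// The RGB565 packing above is equivalent to the per-pixel scalar expression
+//   rgb565 = (b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11)
+// The pcmpeqb/psrld/pslld sequences build the 0x0000001f, 0x000007e0 and
+// 0xfffff800 field masks, and packssdw narrows each dword to a 16-bit pixel.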
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels, dest aligned 16.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicates UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       ebx
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ecx, [esp + 12 + 20]  // width
+    sub        edi, esi
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV411  // modifies EBX
+    YUVTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* uv_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // Y
+    mov        esi, [esp + 4 + 8]   // UV
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READNV12
+    YUVTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels, dest aligned 16.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_SSSE3(const uint8* y_buf,
+                         const uint8* uv_buf,
+                         uint8* dst_argb,
+                         int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // Y
+    mov        esi, [esp + 4 + 8]   // VU
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READNV12
+    YVUTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels, unaligned.
+// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* dst_argb,
+                                   int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV444
+    YUVTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels, unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* dst_argb,
+                                   int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels, unaligned.
+// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+// Similar to I420 but duplicates UV once more.
+__declspec(naked) __declspec(align(16))
+void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* dst_argb,
+                                   int width) {
+  __asm {
+    push       ebx
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]   // Y
+    mov        esi, [esp + 12 + 8]   // U
+    mov        edi, [esp + 12 + 12]  // V
+    mov        edx, [esp + 12 + 16]  // argb
+    mov        ecx, [esp + 12 + 20]  // width
+    sub        edi, esi
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV411  // modifies EBX
+    YUVTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// 8 pixels, dest unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* uv_buf,
+                                   uint8* dst_argb,
+                                   int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // Y
+    mov        esi, [esp + 4 + 8]   // UV
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READNV12
+    YUVTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+
+// 8 pixels, dest unaligned.
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
+__declspec(naked) __declspec(align(16))
+void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* uv_buf,
+                                   uint8* dst_argb,
+                                   int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // Y
+    mov        esi, [esp + 4 + 8]   // VU
+    mov        edx, [esp + 4 + 12]  // argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READNV12
+    YVUTORGB
+
+    // Step 3: Weave into ARGB
+    punpcklbw  xmm0, xmm1           // BG
+    punpcklbw  xmm2, xmm5           // RA
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToBGRARow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_bgra,
+                         int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // bgra
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into BGRA
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    punpcklbw  xmm1, xmm0           // GB
+    punpcklbw  xmm5, xmm2           // AR
+    movdqa     xmm0, xmm5
+    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
+    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
+    movdqa     [edx], xmm5
+    movdqa     [edx + 16], xmm0
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* dst_bgra,
+                                   int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // bgra
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into BGRA
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    punpcklbw  xmm1, xmm0           // GB
+    punpcklbw  xmm5, xmm2           // AR
+    movdqa     xmm0, xmm5
+    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
+    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
+    movdqu     [edx], xmm5
+    movdqu     [edx + 16], xmm0
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToABGRRow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_abgr,
+                         int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // abgr
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into ABGR
+    punpcklbw  xmm2, xmm1           // RG
+    punpcklbw  xmm0, xmm5           // BA
+    movdqa     xmm1, xmm2
+    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
+    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
+    movdqa     [edx], xmm2
+    movdqa     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* dst_abgr,
+                                   int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // abgr
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into ABGR
+    punpcklbw  xmm2, xmm1           // RG
+    punpcklbw  xmm0, xmm5           // BA
+    movdqa     xmm1, xmm2
+    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
+    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
+    movdqu     [edx], xmm2
+    movdqu     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToRGBARow_SSSE3(const uint8* y_buf,
+                         const uint8* u_buf,
+                         const uint8* v_buf,
+                         uint8* dst_rgba,
+                         int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // rgba
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into RGBA
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    punpcklbw  xmm1, xmm2           // GR
+    punpcklbw  xmm5, xmm0           // AB
+    movdqa     xmm0, xmm5
+    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
+    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
+    movdqa     [edx], xmm5
+    movdqa     [edx + 16], xmm0
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
+                                   const uint8* u_buf,
+                                   const uint8* v_buf,
+                                   uint8* dst_rgba,
+                                   int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // rgba
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    pxor       xmm4, xmm4
+
+    align      4
+ convertloop:
+    READYUV422
+    YUVTORGB
+
+    // Step 3: Weave into RGBA
+    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    punpcklbw  xmm1, xmm2           // GR
+    punpcklbw  xmm5, xmm0           // AB
+    movdqa     xmm0, xmm5
+    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
+    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
+    movdqu     [edx], xmm5
+    movdqu     [edx + 16], xmm0
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#endif  // HAS_I422TOARGBROW_SSSE3
+
+#ifdef HAS_YTOARGBROW_SSE2
+__declspec(naked) __declspec(align(16))
+void YToARGBRow_SSE2(const uint8* y_buf,
+                     uint8* rgb_buf,
+                     int width) {
+  __asm {
+    pxor       xmm5, xmm5
+    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pslld      xmm4, 24
+    mov        eax, 0x00100010
+    movd       xmm3, eax
+    pshufd     xmm3, xmm3, 0
+    mov        eax, 0x004a004a       // 74
+    movd       xmm2, eax
+    pshufd     xmm2, xmm2, 0
+    mov        eax, [esp + 4]       // Y
+    mov        edx, [esp + 8]       // rgb
+    mov        ecx, [esp + 12]      // width
+
+    align      4
+ convertloop:
+    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+    movq       xmm0, qword ptr [eax]
+    lea        eax, [eax + 8]
+    punpcklbw  xmm0, xmm5           // 0.Y
+    psubusw    xmm0, xmm3
+    pmullw     xmm0, xmm2
+    psrlw      xmm0, 6
+    packuswb   xmm0, xmm0           // G
+
+    // Step 2: Weave into ARGB
+    punpcklbw  xmm0, xmm0           // GG
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
+    por        xmm0, xmm4
+    por        xmm1, xmm4
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx,  [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_YTOARGBROW_SSE2
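+
+// YToARGBRow_SSE2 above uses the same fixed-point scaling as YUVTORGB but with
+// no chroma: 0x0010 is the 16 luma bias and 0x004a (74) is ~1.164 in 6-bit
+// fixed point, so each channel is roughly clamp8(((y - 16) * 74) >> 6),
+// replicated into B, G and R with 0xff alpha.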
+
+#ifdef HAS_MIRRORROW_SSSE3
+// Shuffle table for reversing the bytes.
+static const uvec8 kShuffleMirror = {
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    movdqa    xmm5, kShuffleMirror
+    lea       eax, [eax - 16]
+
+    align      4
+ convertloop:
+    movdqa    xmm0, [eax + ecx]
+    pshufb    xmm0, xmm5
+    sub       ecx, 16
+    movdqa    [edx], xmm0
+    lea       edx, [edx + 16]
+    jg        convertloop
+    ret
+  }
+}
+#endif  // HAS_MIRRORROW_SSSE3
+
+#ifdef HAS_MIRRORROW_AVX2
+// Shuffle table for reversing the bytes.
+static const ulvec8 kShuffleMirror_AVX2 = {
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
+  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    vmovdqa   ymm5, kShuffleMirror_AVX2
+    lea       eax, [eax - 32]
+
+    align      4
+ convertloop:
+    vmovdqu   ymm0, [eax + ecx]
+    vpshufb   ymm0, ymm0, ymm5
+    vpermq    ymm0, ymm0, 0x4e  // swap high and low halves
+    sub       ecx, 32
+    vmovdqu   [edx], ymm0
+    lea       edx, [edx + 32]
+    jg        convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_MIRRORROW_AVX2
+
+#ifdef HAS_MIRRORROW_SSE2
+// The SSE2 version uses movdqu, so it can be used on unaligned buffers when
+// the SSSE3 version cannot.
+__declspec(naked) __declspec(align(16))
+void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    lea       eax, [eax - 16]
+
+    align      4
+ convertloop:
+    movdqu    xmm0, [eax + ecx]
+    movdqa    xmm1, xmm0        // swap bytes
+    psllw     xmm0, 8
+    psrlw     xmm1, 8
+    por       xmm0, xmm1
+    pshuflw   xmm0, xmm0, 0x1b  // swap words
+    pshufhw   xmm0, xmm0, 0x1b
+    pshufd    xmm0, xmm0, 0x4e  // swap qwords
+    sub       ecx, 16
+    movdqu    [edx], xmm0
+    lea       edx, [edx + 16]
+    jg        convertloop
+    ret
+  }
+}
+#endif  // HAS_MIRRORROW_SSE2
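+
+// Both mirror variants above reverse a row 16 bytes at a time while reading the
+// source backwards: MirrorRow_SSSE3 uses pshufb with kShuffleMirror (15..0) to
+// reverse each block, while MirrorRow_SSE2 gets the same effect without pshufb
+// by swapping bytes within words (shifts + por), then words (pshuflw/pshufhw),
+// then qwords (pshufd).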
+
+#ifdef HAS_MIRRORROW_UV_SSSE3
+// Shuffle table for mirroring UV channels: reversed U bytes go to the low
+// 8 bytes, reversed V bytes to the high 8 bytes.
+static const uvec8 kShuffleMirrorUV = {
+  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+
+__declspec(naked) __declspec(align(16))
+void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+                       int width) {
+  __asm {
+    push      edi
+    mov       eax, [esp + 4 + 4]   // src
+    mov       edx, [esp + 4 + 8]   // dst_u
+    mov       edi, [esp + 4 + 12]  // dst_v
+    mov       ecx, [esp + 4 + 16]  // width
+    movdqa    xmm1, kShuffleMirrorUV
+    lea       eax, [eax + ecx * 2 - 16]
+    sub       edi, edx
+
+    align      4
+ convertloop:
+    movdqa    xmm0, [eax]
+    lea       eax, [eax - 16]
+    pshufb    xmm0, xmm1
+    sub       ecx, 8
+    movlpd    qword ptr [edx], xmm0
+    movhpd    qword ptr [edx + edi], xmm0
+    lea       edx, [edx + 8]
+    jg        convertloop
+
+    pop       edi
+    ret
+  }
+}
+#endif  // HAS_MIRRORROW_UV_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_SSSE3
+// Shuffle table for reversing the ARGB pixels (4 bytes at a time).
+static const uvec8 kARGBShuffleMirror = {
+  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
+};
+
+__declspec(naked) __declspec(align(16))
+void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
+    movdqa    xmm5, kARGBShuffleMirror
+
+    align      4
+ convertloop:
+    movdqa    xmm0, [eax]
+    lea       eax, [eax - 16]
+    pshufb    xmm0, xmm5
+    sub       ecx, 4
+    movdqa    [edx], xmm0
+    lea       edx, [edx + 16]
+    jg        convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBMIRRORROW_SSSE3
+
+#ifdef HAS_ARGBMIRRORROW_AVX2
+// Permute table for reversing the order of ARGB pixels (dwords).
+static const ulvec32 kARGBShuffleMirror_AVX2 = {
+  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
+};
+
+__declspec(naked) __declspec(align(16))
+void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov       eax, [esp + 4]   // src
+    mov       edx, [esp + 8]   // dst
+    mov       ecx, [esp + 12]  // width
+    lea       eax, [eax - 32]
+    vmovdqa   ymm5, kARGBShuffleMirror_AVX2
+
+    align      4
+ convertloop:
+    vpermd    ymm0, ymm5, [eax + ecx * 4]  // permute dword order
+    sub       ecx, 8
+    vmovdqu   [edx], ymm0
+    lea       edx, [edx + 32]
+    jg        convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBMIRRORROW_AVX2
+
+#ifdef HAS_SPLITUVROW_SSE2
+__declspec(naked) __declspec(align(16))
+void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm0
+    movdqa     xmm3, xmm1
+    pand       xmm0, xmm5   // even bytes
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    psrlw      xmm2, 8      // odd bytes
+    psrlw      xmm3, 8
+    packuswb   xmm2, xmm3
+    movdqa     [edx], xmm0
+    movdqa     [edx + edi], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+                               int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm0
+    movdqa     xmm3, xmm1
+    pand       xmm0, xmm5   // even bytes
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    psrlw      xmm2, 8      // odd bytes
+    psrlw      xmm3, 8
+    packuswb   xmm2, xmm3
+    movdqu     [edx], xmm0
+    movdqu     [edx + edi], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+#endif  // HAS_SPLITUVROW_SSE2
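+
+// SplitUVRow deinterleaves a packed UV row; the scalar equivalent is simply
+//   dst_u[i] = src_uv[2 * i]; dst_v[i] = src_uv[2 * i + 1];
+// The 0x00ff00ff mask keeps the even (U) bytes and psrlw exposes the odd (V)
+// bytes before both are repacked with packuswb.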
+
+#ifdef HAS_SPLITUVROW_AVX2
+__declspec(naked) __declspec(align(16))
+void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm2, ymm0, 8      // odd bytes
+    vpsrlw     ymm3, ymm1, 8
+    vpand      ymm0, ymm0, ymm5   // even bytes
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1
+    vpackuswb  ymm2, ymm2, ymm3
+    vpermq     ymm0, ymm0, 0xd8
+    vpermq     ymm2, ymm2, 0xd8
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + edi], ymm2
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_SPLITUVROW_AVX2
+
+#ifdef HAS_MERGEUVROW_SSE2
+__declspec(naked) __declspec(align(16))
+void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]      // read 16 U's
+    movdqa     xmm1, [eax + edx]  // and 16 V's
+    lea        eax,  [eax + 16]
+    movdqa     xmm2, xmm0
+    punpcklbw  xmm0, xmm1       // first 8 UV pairs
+    punpckhbw  xmm2, xmm1       // next 8 UV pairs
+    movdqa     [edi], xmm0
+    movdqa     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
+                               uint8* dst_uv, int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]      // read 16 U's
+    movdqu     xmm1, [eax + edx]  // and 16 V's
+    lea        eax,  [eax + 16]
+    movdqa     xmm2, xmm0
+    punpcklbw  xmm0, xmm1       // first 8 UV pairs
+    punpckhbw  xmm2, xmm1       // next 8 UV pairs
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+#endif  //  HAS_MERGEUVROW_SSE2
+
+#ifdef HAS_MERGEUVROW_AVX2
+__declspec(naked) __declspec(align(16))
+void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+                     int width) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_u
+    mov        edx, [esp + 4 + 8]    // src_v
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // width
+    sub        edx, eax
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]           // read 32 U's
+    vmovdqu    ymm1, [eax + edx]     // and 32 V's
+    lea        eax,  [eax + 32]
+    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
+    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
+    vperm2i128 ymm1, ymm2, ymm0, 0x20  // low 128 of ymm2 and low 128 of ymm0
+    vperm2i128 ymm2, ymm2, ymm0, 0x31  // high 128 of ymm2 and high 128 of ymm0
+    vmovdqu    [edi], ymm1
+    vmovdqu    [edi + 32], ymm2
+    lea        edi, [edi + 64]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  //  HAS_MERGEUVROW_AVX2
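+
+// MergeUVRow is the inverse of SplitUVRow: dst_uv[2 * i] = src_u[i] and
+// dst_uv[2 * i + 1] = src_v[i].  In the AVX2 version vpunpcklbw/vpunpckhbw
+// operate per 128-bit lane, so the vperm2i128 pair is needed to restore
+// sequential output order.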
+
+#ifdef HAS_COPYROW_SSE2
+// CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
+__declspec(naked) __declspec(align(16))
+void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_COPYROW_SSE2
+
+// Unaligned; copies 'count' bytes (any multiple of 1) with rep movsb.
+__declspec(naked) __declspec(align(16))
+void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, esi
+    mov        edx, edi
+    mov        esi, [esp + 4]   // src
+    mov        edi, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    rep movsb
+    mov        edi, edx
+    mov        esi, eax
+    ret
+  }
+}
+
+#ifdef HAS_COPYROW_X86
+__declspec(naked) __declspec(align(16))
+void CopyRow_X86(const uint8* src, uint8* dst, int count) {
+  __asm {
+    mov        eax, esi
+    mov        edx, edi
+    mov        esi, [esp + 4]   // src
+    mov        edi, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    shr        ecx, 2
+    rep movsd
+    mov        edi, edx
+    mov        esi, eax
+    ret
+  }
+}
+#endif  // HAS_COPYROW_X86
+
+#ifdef HAS_ARGBCOPYALPHAROW_SSE2
+// width in pixels
+__declspec(naked) __declspec(align(16))
+void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pslld      xmm0, 24
+    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    psrld      xmm1, 8
+
+    align      4
+  convertloop:
+    movdqa     xmm2, [eax]
+    movdqa     xmm3, [eax + 16]
+    lea        eax, [eax + 32]
+    movdqa     xmm4, [edx]
+    movdqa     xmm5, [edx + 16]
+    pand       xmm2, xmm0
+    pand       xmm3, xmm0
+    pand       xmm4, xmm1
+    pand       xmm5, xmm1
+    por        xmm2, xmm4
+    por        xmm3, xmm5
+    movdqa     [edx], xmm2
+    movdqa     [edx + 16], xmm3
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYALPHAROW_AVX2
+// width in pixels
+__declspec(naked) __declspec(align(16))
+void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    vpcmpeqb   ymm0, ymm0, ymm0
+    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+
+    align      4
+  convertloop:
+    vmovdqu    ymm1, [eax]
+    vmovdqu    ymm2, [eax + 32]
+    lea        eax, [eax + 64]
+    vpblendvb  ymm1, ymm1, [edx], ymm0
+    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYALPHAROW_AVX2
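+
+// Both ARGBCopyAlphaRow variants above overwrite only the alpha channel of the
+// destination, i.e. per pixel dst = (src & 0xff000000) | (dst & 0x00ffffff);
+// the SSE2 path uses two masks and por, the AVX2 path a single vpblendvb
+// against the 0x00ffffff mask.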
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
+// width in pixels
+__declspec(naked) __declspec(align(16))
+void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    pslld      xmm0, 24
+    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    psrld      xmm1, 8
+
+    align      4
+  convertloop:
+    movq       xmm2, qword ptr [eax]  // 8 Y's
+    lea        eax, [eax + 8]
+    punpcklbw  xmm2, xmm2
+    punpckhwd  xmm3, xmm2
+    punpcklwd  xmm2, xmm2
+    movdqa     xmm4, [edx]
+    movdqa     xmm5, [edx + 16]
+    pand       xmm2, xmm0
+    pand       xmm3, xmm0
+    pand       xmm4, xmm1
+    pand       xmm5, xmm1
+    por        xmm2, xmm4
+    por        xmm3, xmm5
+    movdqa     [edx], xmm2
+    movdqa     [edx + 16], xmm3
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
+
+#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
+// width in pixels
+__declspec(naked) __declspec(align(16))
+void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // count
+    vpcmpeqb   ymm0, ymm0, ymm0
+    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+
+    align      4
+  convertloop:
+    vpmovzxbd  ymm1, qword ptr [eax]
+    vpmovzxbd  ymm2, qword ptr [eax + 8]
+    lea        eax, [eax + 16]
+    vpslld     ymm1, ymm1, 24
+    vpslld     ymm2, ymm2, 24
+    vpblendvb  ymm1, ymm1, [edx], ymm0
+    vpblendvb  ymm2, ymm2, [edx + 32], ymm0
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
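+
+// ARGBCopyYToAlphaRow reads 8-bit Y values and writes them into the alpha byte
+// of existing ARGB pixels, roughly dst = (y << 24) | (dst & 0x00ffffff) per
+// pixel; the AVX2 path widens the Y bytes with vpmovzxbd and shifts them into
+// place (16 pixels per iteration versus 8 for SSE2).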
+
+#ifdef HAS_SETROW_X86
+// SetRow writes 'count' bytes using a 32 bit value repeated.
+__declspec(naked) __declspec(align(16))
+void SetRow_X86(uint8* dst, uint32 v32, int count) {
+  __asm {
+    mov        edx, edi
+    mov        edi, [esp + 4]   // dst
+    mov        eax, [esp + 8]   // v32
+    mov        ecx, [esp + 12]  // count
+    shr        ecx, 2
+    rep stosd
+    mov        edi, edx
+    ret
+  }
+}
+
+// ARGBSetRows writes 'width' 32 bit values per row, repeated for 'height' rows.
+__declspec(naked) __declspec(align(16))
+void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
+                   int dst_stride, int height) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebp
+    mov        edi, [esp + 12 + 4]   // dst
+    mov        eax, [esp + 12 + 8]   // v32
+    mov        ebp, [esp + 12 + 12]  // width
+    mov        edx, [esp + 12 + 16]  // dst_stride
+    mov        esi, [esp + 12 + 20]  // height
+    lea        ecx, [ebp * 4]
+    sub        edx, ecx             // stride - width * 4
+
+    align      4
+  convertloop:
+    mov        ecx, ebp
+    rep stosd
+    add        edi, edx
+    sub        esi, 1
+    jg         convertloop
+
+    pop        ebp
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SETROW_X86
+
+#ifdef HAS_YUY2TOYROW_AVX2
+__declspec(naked) __declspec(align(16))
+void YUY2ToYRow_AVX2(const uint8* src_yuy2,
+                     uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_yuy2
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // even bytes are Y
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    sub        ecx, 32
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_yuy2
+    mov        esi, [esp + 8 + 8]    // stride_yuy2
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_yuy2
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToYRow_AVX2(const uint8* src_uyvy,
+                     uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_uyvy
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    sub        ecx, 32
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_uyvy
+    mov        esi, [esp + 8 + 8]    // stride_uyvy
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uyvy
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    vpsrlw     ymm5, ymm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax,  [eax + 64]
+    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm1, ymm1, ymm5
+    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpermq     ymm0, ymm0, 0xd8
+    vpand      ymm1, ymm0, ymm5  // U
+    vpsrlw     ymm0, ymm0, 8     // V
+    vpackuswb  ymm1, ymm1, ymm1  // mutates.
+    vpackuswb  ymm0, ymm0, ymm0  // mutates.
+    vpermq     ymm1, ymm1, 0xd8
+    vpermq     ymm0, ymm0, 0xd8
+    vextractf128 [edx], ymm1, 0  // U
+    vextractf128 [edx + edi], ymm0, 0 // V
+    lea        edx, [edx + 16]
+    sub        ecx, 32
+    jg         convertloop
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_YUY2TOYROW_AVX2
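+
+// The YUY2/UYVY row functions rely on the packed 4:2:2 byte layouts
+// Y0 U0 Y1 V0 (YUY2) and U0 Y0 V0 Y1 (UYVY): luma is extracted by masking or
+// shifting every other byte, the *ToUVRow variants average two source rows
+// (vpavgb/pavgb with [eax + esi]) to produce 4:2:0 chroma, and the *ToUV422Row
+// variants take chroma from a single row.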
+
+#ifdef HAS_YUY2TOYROW_SSE2
+__declspec(naked) __declspec(align(16))
+void YUY2ToYRow_SSE2(const uint8* src_yuy2,
+                     uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_yuy2
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5   // even bytes are Y
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_yuy2
+    mov        esi, [esp + 8 + 8]    // stride_yuy2
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + esi]
+    movdqa     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_yuy2
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
+                               uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_yuy2
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5   // even bytes are Y
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                                uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_yuy2
+    mov        esi, [esp + 8 + 8]    // stride_yuy2
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+                                   uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_yuy2
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToYRow_SSE2(const uint8* src_uyvy,
+                     uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_uyvy
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8    // odd bytes are Y
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_uyvy
+    mov        esi, [esp + 8 + 8]    // stride_uyvy
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + esi]
+    movdqa     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uyvy
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
+                               uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_uyvy
+    mov        edx, [esp + 8]    // dst_y
+    mov        ecx, [esp + 12]   // pix
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8    // odd bytes are Y
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                                uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_uyvy
+    mov        esi, [esp + 8 + 8]    // stride_uyvy
+    mov        edx, [esp + 8 + 12]   // dst_u
+    mov        edi, [esp + 8 + 16]   // dst_v
+    mov        ecx, [esp + 8 + 20]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+                                   uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uyvy
+    mov        edx, [esp + 4 + 8]    // dst_u
+    mov        edi, [esp + 4 + 12]   // dst_v
+    mov        ecx, [esp + 4 + 16]   // pix
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+    sub        edi, edx
+
+    align      4
+  convertloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    movdqa     xmm1, xmm0
+    pand       xmm0, xmm5  // U
+    packuswb   xmm0, xmm0
+    psrlw      xmm1, 8     // V
+    packuswb   xmm1, xmm1
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + edi], xmm1
+    lea        edx, [edx + 8]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    ret
+  }
+}
+#endif  // HAS_YUY2TOYROW_SSE2
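+
+// For reference, the packed 4:2:2 layouts handled by the rows above are,
+// per pair of pixels:
+//   YUY2: Y0 U0 Y1 V0   (Y in even bytes, shared U/V in odd bytes)
+//   UYVY: U0 Y0 V0 Y1   (Y in odd bytes,  shared U/V in even bytes)
+// A minimal scalar sketch of the YUY2 extraction those rows vectorize
+// (illustrative only; this helper is not a libyuv symbol):
+static void YUY2ToY_UV422_Sketch(const uint8* src_yuy2, uint8* dst_y,
+                                 uint8* dst_u, uint8* dst_v, int pix) {
+  int x;
+  for (x = 0; x < pix - 1; x += 2) {
+    dst_y[x] = src_yuy2[0];      // Y0
+    dst_y[x + 1] = src_yuy2[2];  // Y1
+    dst_u[x / 2] = src_yuy2[1];  // U shared by the pixel pair
+    dst_v[x / 2] = src_yuy2[3];  // V shared by the pixel pair
+    src_yuy2 += 4;
+  }
+}
+// The two-row YUY2ToUVRow/UYVYToUVRow variants above additionally average
+// (pavgb) the chroma samples of two adjacent source rows before storing.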
+
+#ifdef HAS_ARGBBLENDROW_SSE2
+// Blend 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                       uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm7, xmm7       // generate constant 1
+    psrlw      xmm7, 15
+    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    psrlw      xmm6, 8
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    psllw      xmm5, 8
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pslld      xmm4, 24
+
+    sub        ecx, 1
+    je         convertloop1     // only 1 pixel?
+    jl         convertloop1b
+
+    // 1 pixel loop until destination pointer is aligned.
+  alignloop1:
+    test       edx, 15          // aligned?
+    je         alignloop1b
+    movd       xmm3, [eax]
+    lea        eax, [eax + 4]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movd       xmm2, [esi]      // _r_b
+    psrlw      xmm3, 8          // alpha
+    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+    pshuflw    xmm3, xmm3, 0F5h
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movd       xmm1, [esi]      // _a_g
+    lea        esi, [esi + 4]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 1
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    jge        alignloop1
+
+  alignloop1b:
+    add        ecx, 1 - 4
+    jl         convertloop4b
+
+    // 4 pixel loop.
+  convertloop4:
+    movdqu     xmm3, [eax]      // src argb
+    lea        eax, [eax + 16]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movdqu     xmm2, [esi]      // _r_b
+    psrlw      xmm3, 8          // alpha
+    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+    pshuflw    xmm3, xmm3, 0F5h
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movdqu     xmm1, [esi]      // _a_g
+    lea        esi, [esi + 16]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jge        convertloop4
+
+  convertloop4b:
+    add        ecx, 4 - 1
+    jl         convertloop1b
+
+    // 1 pixel loop.
+  convertloop1:
+    movd       xmm3, [eax]      // src argb
+    lea        eax, [eax + 4]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movd       xmm2, [esi]      // _r_b
+    psrlw      xmm3, 8          // alpha
+    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+    pshuflw    xmm3, xmm3, 0F5h
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movd       xmm1, [esi]      // _a_g
+    lea        esi, [esi + 4]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 1
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    jge        convertloop1
+
+  convertloop1b:
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSE2
+
+#ifdef HAS_ARGBBLENDROW_SSSE3
+// Shuffle table for isolating alpha.
+static const uvec8 kShuffleAlpha = {
+  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
+  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
+};
+// Same as SSE2, but replaces:
+//    psrlw      xmm3, 8          // alpha
+//    pshufhw    xmm3, xmm3, 0F5h // 8 alpha words
+//    pshuflw    xmm3, xmm3, 0F5h
+// with..
+//    pshufb     xmm3, kShuffleAlpha // alpha
+// Blend 8 pixels at a time.
+
+__declspec(naked) __declspec(align(16))
+void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
+    psrlw      xmm7, 15
+    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    psrlw      xmm6, 8
+    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    psllw      xmm5, 8
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pslld      xmm4, 24
+
+    sub        ecx, 1
+    je         convertloop1     // only 1 pixel?
+    jl         convertloop1b
+
+    // 1 pixel loop until destination pointer is aligned.
+  alignloop1:
+    test       edx, 15          // aligned?
+    je         alignloop1b
+    movd       xmm3, [eax]
+    lea        eax, [eax + 4]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movd       xmm2, [esi]      // _r_b
+    pshufb     xmm3, kShuffleAlpha // alpha
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movd       xmm1, [esi]      // _a_g
+    lea        esi, [esi + 4]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 1
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    jge        alignloop1
+
+  alignloop1b:
+    add        ecx, 1 - 4
+    jl         convertloop4b
+
+    test       eax, 15          // unaligned?
+    jne        convertuloop4
+    test       esi, 15          // unaligned?
+    jne        convertuloop4
+
+    // 4 pixel loop.
+  convertloop4:
+    movdqa     xmm3, [eax]      // src argb
+    lea        eax, [eax + 16]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movdqa     xmm2, [esi]      // _r_b
+    pshufb     xmm3, kShuffleAlpha // alpha
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movdqa     xmm1, [esi]      // _a_g
+    lea        esi, [esi + 16]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jge        convertloop4
+    jmp        convertloop4b
+
+    // 4 pixel unaligned loop.
+  convertuloop4:
+    movdqu     xmm3, [eax]      // src argb
+    lea        eax, [eax + 16]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movdqu     xmm2, [esi]      // _r_b
+    pshufb     xmm3, kShuffleAlpha // alpha
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movdqu     xmm1, [esi]      // _a_g
+    lea        esi, [esi + 16]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jge        convertuloop4
+
+  convertloop4b:
+    add        ecx, 4 - 1
+    jl         convertloop1b
+
+    // 1 pixel loop.
+  convertloop1:
+    movd       xmm3, [eax]      // src argb
+    lea        eax, [eax + 4]
+    movdqa     xmm0, xmm3       // src argb
+    pxor       xmm3, xmm4       // ~alpha
+    movd       xmm2, [esi]      // _r_b
+    pshufb     xmm3, kShuffleAlpha // alpha
+    pand       xmm2, xmm6       // _r_b
+    paddw      xmm3, xmm7       // 256 - alpha
+    pmullw     xmm2, xmm3       // _r_b * alpha
+    movd       xmm1, [esi]      // _a_g
+    lea        esi, [esi + 4]
+    psrlw      xmm1, 8          // _a_g
+    por        xmm0, xmm4       // set alpha to 255
+    pmullw     xmm1, xmm3       // _a_g * alpha
+    psrlw      xmm2, 8          // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2       // + src argb
+    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1       // + src argb
+    sub        ecx, 1
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    jge        convertloop1
+
+  convertloop1b:
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBBLENDROW_SSSE3
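+
+// Reference sketch of the per-pixel blend the two ARGBBlendRow variants
+// above compute (illustrative only; this helper is not a libyuv symbol).
+// src0 is treated as the foreground (assumed already attenuated), src1 as
+// the background:  dst = saturate(src0 + src1 * (256 - a0) / 256), A = 255.
+static void ARGBBlendPixel_Sketch(const uint8* src0, const uint8* src1,
+                                  uint8* dst) {
+  const int fb = 256 - src0[3];             // background weight, 256 - alpha
+  const int b = src0[0] + ((src1[0] * fb) >> 8);
+  const int g = src0[1] + ((src1[1] * fb) >> 8);
+  const int r = src0[2] + ((src1[2] * fb) >> 8);
+  dst[0] = (uint8)(b > 255 ? 255 : b);      // B
+  dst[1] = (uint8)(g > 255 ? 255 : g);      // G
+  dst[2] = (uint8)(r > 255 ? 255 : r);      // R
+  dst[3] = 255;                             // alpha forced to 255, as above
+}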
+
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+// Attenuate 4 pixels at a time.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pslld      xmm4, 24
+    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff
+    psrld      xmm5, 8
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]      // read 4 pixels
+    punpcklbw  xmm0, xmm0       // first 2
+    pshufhw    xmm2, xmm0, 0FFh // 8 alpha words
+    pshuflw    xmm2, xmm2, 0FFh
+    pmulhuw    xmm0, xmm2       // rgb * a
+    movdqa     xmm1, [eax]      // read 4 pixels
+    punpckhbw  xmm1, xmm1       // next 2 pixels
+    pshufhw    xmm2, xmm1, 0FFh // 8 alpha words
+    pshuflw    xmm2, xmm2, 0FFh
+    pmulhuw    xmm1, xmm2       // rgb * a
+    movdqa     xmm2, [eax]      // alphas
+    lea        eax, [eax + 16]
+    psrlw      xmm0, 8
+    pand       xmm2, xmm4
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    pand       xmm0, xmm5       // keep original alphas
+    por        xmm0, xmm2
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBATTENUATEROW_SSE2
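+
+// Scalar view of the attenuation computed by the SSE2 row above and the
+// SSSE3/AVX2 variants that follow (illustrative only; this helper is not a
+// libyuv symbol).  Each color channel is scaled by the pixel's own alpha
+// using the same ((c * 257) * (a * 257)) >> 24 fixed point the SIMD code
+// uses, which closely approximates c * a / 255.
+static void ARGBAttenuatePixel_Sketch(const uint8* src, uint8* dst) {
+  const uint32 a = src[3];
+  dst[0] = (uint8)(((src[0] * 257u) * (a * 257u)) >> 24);  // B
+  dst[1] = (uint8)(((src[1] * 257u) * (a * 257u)) >> 24);  // G
+  dst[2] = (uint8)(((src[2] * 257u) * (a * 257u)) >> 24);  // R
+  dst[3] = (uint8)a;                                       // A unchanged
+}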
+
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+// Shuffle table duplicating alpha.
+static const uvec8 kShuffleAlpha0 = {
+  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+};
+static const uvec8 kShuffleAlpha1 = {
+  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+};
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
+    pslld      xmm3, 24
+    movdqa     xmm4, kShuffleAlpha0
+    movdqa     xmm5, kShuffleAlpha1
+
+    align      4
+ convertloop:
+    movdqu     xmm0, [eax]      // read 4 pixels
+    pshufb     xmm0, xmm4       // isolate first 2 alphas
+    movdqu     xmm1, [eax]      // read 4 pixels
+    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
+    pmulhuw    xmm0, xmm1       // rgb * a
+    movdqu     xmm1, [eax]      // read 4 pixels
+    pshufb     xmm1, xmm5       // isolate next 2 alphas
+    movdqu     xmm2, [eax]      // read 4 pixels
+    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
+    pmulhuw    xmm1, xmm2       // rgb * a
+    movdqu     xmm2, [eax]      // mask original alpha
+    lea        eax, [eax + 16]
+    pand       xmm2, xmm3
+    psrlw      xmm0, 8
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    por        xmm0, xmm2       // copy original alpha
+    sub        ecx, 4
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBATTENUATEROW_SSSE3
+
+#ifdef HAS_ARGBATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const ulvec8 kShuffleAlpha_AVX2 = {
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
+  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
+  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
+  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
+};
+__declspec(naked) __declspec(align(16))
+void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax
+    vmovdqa    ymm4, kShuffleAlpha_AVX2
+    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
+    vpslld     ymm5, ymm5, 24
+
+    align      4
+ convertloop:
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
+    vpshufb    ymm3, ymm1, ymm4  // high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * a
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * a
+    vpand      ymm6, ymm6, ymm5  // isolate alpha
+    vpsrlw     ymm0, ymm0, 8
+    vpsrlw     ymm1, ymm1, 8
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vpor       ymm0, ymm0, ymm6  // copy original alpha
+    sub        ecx, 8
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBUNATTENUATEROW_SSE2
+// Unattenuate 4 pixels at a time.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb0
+    mov        edx, [esp + 8 + 8]   // dst_argb
+    mov        ecx, [esp + 8 + 12]  // width
+
+    align      4
+ convertloop:
+    movdqu     xmm0, [eax]      // read 4 pixels
+    movzx      esi, byte ptr [eax + 3]  // first alpha
+    movzx      edi, byte ptr [eax + 7]  // second alpha
+    punpcklbw  xmm0, xmm0       // first 2
+    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
+    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
+    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
+    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    movlhps    xmm2, xmm3
+    pmulhuw    xmm0, xmm2       // rgb * a
+
+    movdqu     xmm1, [eax]      // read 4 pixels
+    movzx      esi, byte ptr [eax + 11]  // third alpha
+    movzx      edi, byte ptr [eax + 15]  // fourth alpha
+    punpckhbw  xmm1, xmm1       // next 2
+    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]
+    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]
+    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
+    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    movlhps    xmm2, xmm3
+    pmulhuw    xmm1, xmm2       // rgb * a
+    lea        eax, [eax + 16]
+
+    packuswb   xmm0, xmm1
+    sub        ecx, 4
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBUNATTENUATEROW_SSE2
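+
+// Conceptually, unattenuation inverts the attenuation above:
+//   c' = min(255, c * 255 / a)   for a != 0.
+// Illustrative sketch only (not a libyuv symbol); the SSE2 row above and
+// the AVX2 variants below use the fixed_invtbl8 reciprocal table and
+// fixed-point multiplies instead of a division, so their rounding differs.
+static void ARGBUnattenuatePixel_Sketch(const uint8* src, uint8* dst) {
+  const uint32 a = src[3];
+  int i;
+  for (i = 0; i < 3; ++i) {
+    // Passing the channel through for a == 0 is an arbitrary choice made
+    // for this sketch.
+    const uint32 c = a ? (src[i] * 255u) / a : src[i];
+    dst[i] = (uint8)(c > 255u ? 255u : c);
+  }
+  dst[3] = (uint8)a;
+}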
+
+#ifdef HAS_ARGBUNATTENUATEROW_AVX2
+// Shuffle table duplicating alpha.
+static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
+  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
+};
+// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
+// USE_GATHER is not on by default, due to being a slow instruction.
+#ifdef USE_GATHER
+__declspec(naked) __declspec(align(16))
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax
+    vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2
+
+    align      4
+ convertloop:
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
+    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
+    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
+    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
+    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas. 1, a, a, a
+    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    sub        ecx, 8
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#else  // USE_GATHER
+__declspec(naked) __declspec(align(16))
+void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                             int width) {
+  __asm {
+
+    mov        eax, [esp + 4]   // src_argb0
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    sub        edx, eax
+    vmovdqa    ymm5, kUnattenShuffleAlpha_AVX2
+
+    push       esi
+    push       edi
+
+    align      4
+ convertloop:
+    // replace VPGATHER
+    movzx      esi, byte ptr [eax + 3]                 // alpha0
+    movzx      edi, byte ptr [eax + 7]                 // alpha1
+    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a0]
+    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a1]
+    movzx      esi, byte ptr [eax + 11]                // alpha2
+    movzx      edi, byte ptr [eax + 15]                // alpha3
+    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
+    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a2]
+    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a3]
+    movzx      esi, byte ptr [eax + 19]                // alpha4
+    movzx      edi, byte ptr [eax + 23]                // alpha5
+    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
+    vmovd      xmm0, dword ptr fixed_invtbl8[esi * 4]  // [1,a4]
+    vmovd      xmm1, dword ptr fixed_invtbl8[edi * 4]  // [1,a5]
+    movzx      esi, byte ptr [eax + 27]                // alpha6
+    movzx      edi, byte ptr [eax + 31]                // alpha7
+    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
+    vmovd      xmm2, dword ptr fixed_invtbl8[esi * 4]  // [1,a6]
+    vmovd      xmm3, dword ptr fixed_invtbl8[edi * 4]  // [1,a7]
+    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
+    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
+    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
+    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+    // end of VPGATHER
+
+    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
+    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
+    vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
+    vpunpckhwd ymm3, ymm3, ymm3  // high 4 inverted alphas. mutated.
+    vpshufb    ymm2, ymm2, ymm5  // replicate low 4 alphas. 1, a, a, a
+    vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
+    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
+    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
+    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    sub        ecx, 8
+    vmovdqu    [eax + edx], ymm0
+    lea        eax, [eax + 32]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // USE_GATHER
+#endif  // HAS_ARGBUNATTENUATEROW_AVX2
+
+#ifdef HAS_ARGBGRAYROW_SSSE3
+// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
+__declspec(naked) __declspec(align(16))
+void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]  /* width */
+    movdqa     xmm4, kARGBToYJ
+    movdqa     xmm5, kAddYJ64
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]  // G
+    movdqa     xmm1, [eax + 16]
+    pmaddubsw  xmm0, xmm4
+    pmaddubsw  xmm1, xmm4
+    phaddw     xmm0, xmm1
+    paddw      xmm0, xmm5  // Add .5 for rounding.
+    psrlw      xmm0, 7
+    packuswb   xmm0, xmm0   // 8 G bytes
+    movdqa     xmm2, [eax]  // A
+    movdqa     xmm3, [eax + 16]
+    lea        eax, [eax + 32]
+    psrld      xmm2, 24
+    psrld      xmm3, 24
+    packuswb   xmm2, xmm3
+    packuswb   xmm2, xmm2   // 8 A bytes
+    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
+    punpcklbw  xmm0, xmm0   // 8 GG words
+    punpcklbw  xmm3, xmm2   // 8 GA words
+    movdqa     xmm1, xmm0
+    punpcklwd  xmm0, xmm3   // GGGA first 4
+    punpckhwd  xmm1, xmm3   // GGGA next 4
+    sub        ecx, 8
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBGRAYROW_SSSE3
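+
+// Scalar view of the grayscale mapping above (illustrative): the grey value
+// written to B, G and R is a weighted sum of the source channel bytes using
+// the byte weights in kARGBToYJ (defined earlier in this file), with the
+// kAddYJ64 rounding bias (0.5 in the >> 7 fixed point):
+//   gray = (B * WB + G * WG + R * WR + 64) >> 7
+//   dst bytes (B, G, R, A) = gray, gray, gray, A   (alpha passed through)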
+
+#ifdef HAS_ARGBSEPIAROW_SSSE3
+//    b = (r * 35 + g * 68 + b * 17) >> 7
+//    g = (r * 45 + g * 88 + b * 22) >> 7
+//    r = (r * 50 + g * 98 + b * 24) >> 7
+// Constant for ARGB color to sepia tone.
+static const vec8 kARGBToSepiaB = {
+  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
+};
+
+static const vec8 kARGBToSepiaG = {
+  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
+};
+
+static const vec8 kARGBToSepiaR = {
+  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
+};
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+__declspec(naked) __declspec(align(16))
+void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* dst_argb */
+    mov        ecx, [esp + 8]   /* width */
+    movdqa     xmm2, kARGBToSepiaB
+    movdqa     xmm3, kARGBToSepiaG
+    movdqa     xmm4, kARGBToSepiaR
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]  // B
+    movdqa     xmm6, [eax + 16]
+    pmaddubsw  xmm0, xmm2
+    pmaddubsw  xmm6, xmm2
+    phaddw     xmm0, xmm6
+    psrlw      xmm0, 7
+    packuswb   xmm0, xmm0   // 8 B values
+    movdqa     xmm5, [eax]  // G
+    movdqa     xmm1, [eax + 16]
+    pmaddubsw  xmm5, xmm3
+    pmaddubsw  xmm1, xmm3
+    phaddw     xmm5, xmm1
+    psrlw      xmm5, 7
+    packuswb   xmm5, xmm5   // 8 G values
+    punpcklbw  xmm0, xmm5   // 8 BG values
+    movdqa     xmm5, [eax]  // R
+    movdqa     xmm1, [eax + 16]
+    pmaddubsw  xmm5, xmm4
+    pmaddubsw  xmm1, xmm4
+    phaddw     xmm5, xmm1
+    psrlw      xmm5, 7
+    packuswb   xmm5, xmm5   // 8 R values
+    movdqa     xmm6, [eax]  // A
+    movdqa     xmm1, [eax + 16]
+    psrld      xmm6, 24
+    psrld      xmm1, 24
+    packuswb   xmm6, xmm1
+    packuswb   xmm6, xmm6   // 8 A values
+    punpcklbw  xmm5, xmm6   // 8 RA values
+    movdqa     xmm1, xmm0   // Weave BG, RA together
+    punpcklwd  xmm0, xmm5   // BGRA first 4
+    punpckhwd  xmm1, xmm5   // BGRA next 4
+    sub        ecx, 8
+    movdqa     [eax], xmm0
+    movdqa     [eax + 16], xmm1
+    lea        eax, [eax + 32]
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBSEPIAROW_SSSE3
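+
+// Scalar form of the sepia mapping above (illustrative only; this helper is
+// not a libyuv symbol).  The clamp to 255 mirrors the packuswb saturation
+// in the SSSE3 row; alpha is preserved.
+static void ARGBSepiaPixel_Sketch(uint8* p) {
+  const int b = p[0], g = p[1], r = p[2];
+  const int sb = (r * 35 + g * 68 + b * 17) >> 7;
+  const int sg = (r * 45 + g * 88 + b * 22) >> 7;
+  const int sr = (r * 50 + g * 98 + b * 24) >> 7;
+  p[0] = (uint8)(sb > 255 ? 255 : sb);
+  p[1] = (uint8)(sg > 255 ? 255 : sg);
+  p[2] = (uint8)(sr > 255 ? 255 : sr);
+}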
+
+#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
+// Transform 8 ARGB pixels (32 bytes) with color matrix.
+// Same as Sepia except matrix is provided.
+// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
+// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
+__declspec(naked) __declspec(align(16))
+void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                              const int8* matrix_argb, int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]  /* matrix_argb */
+    movdqu     xmm5, [ecx]
+    pshufd     xmm2, xmm5, 0x00
+    pshufd     xmm3, xmm5, 0x55
+    pshufd     xmm4, xmm5, 0xaa
+    pshufd     xmm5, xmm5, 0xff
+    mov        ecx, [esp + 16]  /* width */
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]  // B
+    movdqa     xmm7, [eax + 16]
+    pmaddubsw  xmm0, xmm2
+    pmaddubsw  xmm7, xmm2
+    movdqa     xmm6, [eax]  // G
+    movdqa     xmm1, [eax + 16]
+    pmaddubsw  xmm6, xmm3
+    pmaddubsw  xmm1, xmm3
+    phaddsw    xmm0, xmm7   // B
+    phaddsw    xmm6, xmm1   // G
+    psraw      xmm0, 6      // B
+    psraw      xmm6, 6      // G
+    packuswb   xmm0, xmm0   // 8 B values
+    packuswb   xmm6, xmm6   // 8 G values
+    punpcklbw  xmm0, xmm6   // 8 BG values
+    movdqa     xmm1, [eax]  // R
+    movdqa     xmm7, [eax + 16]
+    pmaddubsw  xmm1, xmm4
+    pmaddubsw  xmm7, xmm4
+    phaddsw    xmm1, xmm7   // R
+    movdqa     xmm6, [eax]  // A
+    movdqa     xmm7, [eax + 16]
+    pmaddubsw  xmm6, xmm5
+    pmaddubsw  xmm7, xmm5
+    phaddsw    xmm6, xmm7   // A
+    psraw      xmm1, 6      // R
+    psraw      xmm6, 6      // A
+    packuswb   xmm1, xmm1   // 8 R values
+    packuswb   xmm6, xmm6   // 8 A values
+    punpcklbw  xmm1, xmm6   // 8 RA values
+    movdqa     xmm6, xmm0   // Weave BG, RA together
+    punpcklwd  xmm0, xmm1   // BGRA first 4
+    punpckhwd  xmm6, xmm1   // BGRA next 4
+    sub        ecx, 8
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm6
+    lea        eax, [eax + 32]
+    lea        edx, [edx + 32]
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
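+
+// Worked form of the color-matrix transform above (illustrative): the 16
+// signed bytes of matrix_argb are used as four rows m[0..3], m[4..7],
+// m[8..11], m[12..15], applied to the source bytes (B, G, R, A):
+//   dst_B = clamp8((B*m[0]  + G*m[1]  + R*m[2]  + A*m[3])  >> 6)
+//   dst_G = clamp8((B*m[4]  + G*m[5]  + R*m[6]  + A*m[7])  >> 6)
+//   dst_R = clamp8((B*m[8]  + G*m[9]  + R*m[10] + A*m[11]) >> 6)
+//   dst_A = clamp8((B*m[12] + G*m[13] + R*m[14] + A*m[15]) >> 6)
+// where clamp8 saturates to [0, 255] (the packuswb in the loop) and >> 6 is
+// the arithmetic shift (psraw 6).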
+
+#ifdef HAS_ARGBQUANTIZEROW_SSE2
+// Quantize 4 ARGB pixels (16 bytes).
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
+                          int interval_offset, int width) {
+  __asm {
+    mov        eax, [esp + 4]    /* dst_argb */
+    movd       xmm2, [esp + 8]   /* scale */
+    movd       xmm3, [esp + 12]  /* interval_size */
+    movd       xmm4, [esp + 16]  /* interval_offset */
+    mov        ecx, [esp + 20]   /* width */
+    pshuflw    xmm2, xmm2, 040h
+    pshufd     xmm2, xmm2, 044h
+    pshuflw    xmm3, xmm3, 040h
+    pshufd     xmm3, xmm3, 044h
+    pshuflw    xmm4, xmm4, 040h
+    pshufd     xmm4, xmm4, 044h
+    pxor       xmm5, xmm5  // constant 0
+    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
+    pslld      xmm6, 24
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]  // read 4 pixels
+    punpcklbw  xmm0, xmm5   // first 2 pixels
+    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
+    movdqa     xmm1, [eax]  // read 4 pixels
+    punpckhbw  xmm1, xmm5   // next 2 pixels
+    pmulhuw    xmm1, xmm2
+    pmullw     xmm0, xmm3   // * interval_size
+    movdqa     xmm7, [eax]  // read 4 pixels
+    pmullw     xmm1, xmm3
+    pand       xmm7, xmm6   // mask alpha
+    paddw      xmm0, xmm4   // + interval_size / 2
+    paddw      xmm1, xmm4
+    packuswb   xmm0, xmm1
+    por        xmm0, xmm7
+    sub        ecx, 4
+    movdqa     [eax], xmm0
+    lea        eax, [eax + 16]
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBQUANTIZEROW_SSE2
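+
+// Scalar form of the quantization above (illustrative only; this helper is
+// not a libyuv symbol).  B, G and R are snapped onto a grid; alpha is left
+// unchanged here, where the SSE2 row masks the original alpha and ORs it
+// back over the result.
+static void ARGBQuantizePixel_Sketch(uint8* p, int scale, int interval_size,
+                                     int interval_offset) {
+  int i;
+  for (i = 0; i < 3; ++i) {
+    const int v = ((p[i] * scale) >> 16) * interval_size + interval_offset;
+    p[i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
+  }
+}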
+
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    movd       xmm2, [esp + 16]  // value
+    punpcklbw  xmm2, xmm2
+    punpcklqdq xmm2, xmm2
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]      // read 4 pixels
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm0       // first 2
+    punpckhbw  xmm1, xmm1       // next 2
+    pmulhuw    xmm0, xmm2       // argb * value
+    pmulhuw    xmm1, xmm2       // argb * value
+    psrlw      xmm0, 8
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBSHADEROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_SSE2
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    pxor       xmm5, xmm5  // constant 0
+
+    align      4
+ convertloop:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, xmm0
+    movdqu     xmm3, xmm2
+    punpcklbw  xmm0, xmm0         // first 2
+    punpckhbw  xmm1, xmm1         // next 2
+    punpcklbw  xmm2, xmm5         // first 2
+    punpckhbw  xmm3, xmm5         // next 2
+    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
+    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
+    lea        eax, [eax + 16]
+    lea        esi, [esi + 16]
+    packuswb   xmm0, xmm1
+    sub        ecx, 4
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBMULTIPLYROW_SSE2
+
+#ifdef HAS_ARGBADDROW_SSE2
+// Add 2 rows of ARGB pixels together, 4 pixels at a time.
+// TODO(fbarchard): Port this to posix, neon and other math functions.
+__declspec(naked) __declspec(align(16))
+void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+    sub        ecx, 4
+    jl         convertloop49
+
+    align      4
+ convertloop4:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
+    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    sub        ecx, 4
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jge        convertloop4
+
+ convertloop49:
+    add        ecx, 4 - 1
+    jl         convertloop19
+
+ convertloop1:
+    movd       xmm0, [eax]        // read 1 pixel from src_argb0
+    lea        eax, [eax + 4]
+    movd       xmm1, [esi]        // read 1 pixel from src_argb1
+    lea        esi, [esi + 4]
+    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    sub        ecx, 1
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    jge        convertloop1
+
+ convertloop19:
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBADDROW_SSE2
+
+#ifdef HAS_ARGBSUBTRACTROW_SSE2
+// Subtract one row of ARGB pixels from another, 4 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+    align      4
+ convertloop:
+    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    lea        eax, [eax + 16]
+    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    lea        esi, [esi + 16]
+    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
+    sub        ecx, 4
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBSUBTRACTROW_SSE2
+
+#ifdef HAS_ARGBMULTIPLYROW_AVX2
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    vpxor      ymm5, ymm5, ymm5     // constant 0
+
+    align      4
+ convertloop:
+    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
+    lea        esi, [esi + 32]
+    vpunpcklbw ymm0, ymm1, ymm1   // low 4
+    vpunpckhbw ymm1, ymm1, ymm1   // high 4
+    vpunpcklbw ymm2, ymm3, ymm5   // low 4
+    vpunpckhbw ymm3, ymm3, ymm5   // high 4
+    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
+    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
+    vpackuswb  ymm0, ymm0, ymm1
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBMULTIPLYROW_AVX2
+
+#ifdef HAS_ARGBADDROW_AVX2
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+    align      4
+ convertloop:
+    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBADDROW_AVX2
+
+#ifdef HAS_ARGBSUBTRACTROW_AVX2
+// Subtract one row of ARGB pixels from another, 8 pixels at a time.
+__declspec(naked) __declspec(align(16))
+void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
+                          uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_argb0
+    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+
+    align      4
+ convertloop:
+    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    lea        eax, [eax + 32]
+    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
+    lea        esi, [esi + 32]
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 8
+    jg         convertloop
+
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBSUBTRACTROW_AVX2
+
+#ifdef HAS_SOBELXROW_SSE2
+// SobelX as a matrix is
+// -1  0  1
+// -2  0  2
+// -1  0  1
+__declspec(naked) __declspec(align(16))
+void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_y0
+    mov        esi, [esp + 8 + 8]   // src_y1
+    mov        edi, [esp + 8 + 12]  // src_y2
+    mov        edx, [esp + 8 + 16]  // dst_sobelx
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        esi, eax
+    sub        edi, eax
+    sub        edx, eax
+    pxor       xmm5, xmm5  // constant 0
+
+    align      4
+ convertloop:
+    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    punpcklbw  xmm0, xmm5
+    punpcklbw  xmm1, xmm5
+    psubw      xmm0, xmm1
+    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm2, xmm5
+    psubw      xmm1, xmm2
+    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
+    movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
+    punpcklbw  xmm2, xmm5
+    punpcklbw  xmm3, xmm5
+    psubw      xmm2, xmm3
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm1
+    paddw      xmm0, xmm1
+    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    psubw      xmm1, xmm0
+    pmaxsw     xmm0, xmm1
+    packuswb   xmm0, xmm0
+    sub        ecx, 8
+    movq       qword ptr [eax + edx], xmm0
+    lea        eax, [eax + 8]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELXROW_SSE2
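+
+// Scalar form of the SobelX row above (illustrative only; this helper is
+// not a libyuv symbol): three horizontal differences, middle row weighted
+// by 2, then absolute value and a clamp to 255 (the packuswb).
+static void SobelXPixel_Sketch(const uint8* y0, const uint8* y1,
+                               const uint8* y2, uint8* dst, int i) {
+  const int s = (y0[i] - y0[i + 2]) +
+                2 * (y1[i] - y1[i + 2]) +
+                (y2[i] - y2[i + 2]);
+  const int a = s < 0 ? -s : s;
+  dst[i] = (uint8)(a > 255 ? 255 : a);
+}
+// The SobelY row below is the same computation turned sideways: vertical
+// differences of two rows taken at column offsets 0, 1 and 2, with the
+// middle column weighted by 2.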
+
+#ifdef HAS_SOBELYROW_SSE2
+// SobelY as a matrix is
+// -1 -2 -1
+//  0  0  0
+//  1  2  1
+__declspec(naked) __declspec(align(16))
+void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
+                    uint8* dst_sobely, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_y0
+    mov        esi, [esp + 4 + 8]   // src_y1
+    mov        edx, [esp + 4 + 12]  // dst_sobely
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax
+    sub        edx, eax
+    pxor       xmm5, xmm5  // constant 0
+
+    align      4
+ convertloop:
+    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    punpcklbw  xmm0, xmm5
+    punpcklbw  xmm1, xmm5
+    psubw      xmm0, xmm1
+    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
+    movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm2, xmm5
+    psubw      xmm1, xmm2
+    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
+    punpcklbw  xmm2, xmm5
+    punpcklbw  xmm3, xmm5
+    psubw      xmm2, xmm3
+    paddw      xmm0, xmm2
+    paddw      xmm0, xmm1
+    paddw      xmm0, xmm1
+    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    psubw      xmm1, xmm0
+    pmaxsw     xmm0, xmm1
+    packuswb   xmm0, xmm0
+    sub        ecx, 8
+    movq       qword ptr [eax + edx], xmm0
+    lea        eax, [eax + 8]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELYROW_SSE2
+
+#ifdef HAS_SOBELROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+__declspec(naked) __declspec(align(16))
+void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                   uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax
+    pcmpeqb    xmm5, xmm5           // alpha 255
+    pslld      xmm5, 24             // 0xff000000
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    movdqa     xmm2, xmm0             // GG
+    punpcklbw  xmm2, xmm0             // First 8
+    punpckhbw  xmm0, xmm0             // Next 8
+    movdqa     xmm1, xmm2             // GGGG
+    punpcklwd  xmm1, xmm2             // First 4
+    punpckhwd  xmm2, xmm2             // Next 4
+    por        xmm1, xmm5             // GGGA
+    por        xmm2, xmm5
+    movdqa     xmm3, xmm0             // GGGG
+    punpcklwd  xmm3, xmm0             // Next 4
+    punpckhwd  xmm0, xmm0             // Last 4
+    por        xmm3, xmm5             // GGGA
+    por        xmm0, xmm5
+    sub        ecx, 16
+    movdqa     [edx], xmm1
+    movdqa     [edx + 16], xmm2
+    movdqa     [edx + 32], xmm3
+    movdqa     [edx + 48], xmm0
+    lea        edx, [edx + 64]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELROW_SSE2
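+
+// Scalar form of the packing above (illustrative only; this helper is not a
+// libyuv symbol): the saturated sum of Sobel X and Sobel Y is replicated
+// into B, G and R, with alpha forced to 255.
+static void SobelPixel_Sketch(uint8 sx, uint8 sy, uint8* dst_argb) {
+  const int s = sx + sy;
+  const uint8 sobel = (uint8)(s > 255 ? 255 : s);
+  dst_argb[0] = sobel;  // B
+  dst_argb[1] = sobel;  // G
+  dst_argb[2] = sobel;  // R
+  dst_argb[3] = 255;    // A
+}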
+
+#ifdef HAS_SOBELTOPLANEROW_SSE2
+// Adds Sobel X and Sobel Y and stores Sobel into a plane.
+__declspec(naked) __declspec(align(16))
+void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                          uint8* dst_y, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_y
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELTOPLANEROW_SSE2
+
+#ifdef HAS_SOBELXYROW_SSE2
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+__declspec(naked) __declspec(align(16))
+void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
+                     uint8* dst_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   // src_sobelx
+    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // width
+    sub        esi, eax
+    pcmpeqb    xmm5, xmm5           // alpha 255
+
+    align      4
+ convertloop:
+    movdqa     xmm0, [eax]            // read 16 pixels src_sobelx
+    movdqa     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    lea        eax, [eax + 16]
+    movdqa     xmm2, xmm0
+    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
+    movdqa     xmm3, xmm0             // XA
+    punpcklbw  xmm3, xmm5
+    punpckhbw  xmm0, xmm5
+    movdqa     xmm4, xmm1             // YS
+    punpcklbw  xmm4, xmm2
+    punpckhbw  xmm1, xmm2
+    movdqa     xmm6, xmm4             // YSXA
+    punpcklwd  xmm6, xmm3             // First 4
+    punpckhwd  xmm4, xmm3             // Next 4
+    movdqa     xmm7, xmm1             // YSXA
+    punpcklwd  xmm7, xmm0             // Next 4
+    punpckhwd  xmm1, xmm0             // Last 4
+    sub        ecx, 16
+    movdqa     [edx], xmm6
+    movdqa     [edx + 16], xmm4
+    movdqa     [edx + 32], xmm7
+    movdqa     [edx + 48], xmm1
+    lea        edx, [edx + 64]
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_SOBELXYROW_SSE2
+
+#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
+// Consider float CumulativeSum.
+// Consider calling CumulativeSum one row at a time as needed.
+// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
+// Convert cumulative sum for an area to an average for 1 pixel.
+// topleft is pointer to top left of CumulativeSum buffer for area.
+// botleft is pointer to bottom left of CumulativeSum buffer.
+// width is offset from left to right of area in CumulativeSum buffer measured
+//   in number of ints.
+// area is the number of pixels in the area being averaged.
+// dst points to pixel to store result to.
+// count is number of averaged pixels to produce.
+// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
+// aligned.
+void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
+                                    int width, int area, uint8* dst,
+                                    int count) {
+  __asm {
+    mov        eax, topleft  // eax topleft
+    mov        esi, botleft  // esi botleft
+    mov        edx, width
+    movd       xmm5, area
+    mov        edi, dst
+    mov        ecx, count
+    cvtdq2ps   xmm5, xmm5
+    rcpss      xmm4, xmm5  // 1.0f / area
+    pshufd     xmm4, xmm4, 0
+    sub        ecx, 4
+    jl         l4b
+
+    cmp        area, 128  // 128 pixels will not overflow 15 bits.
+    ja         l4
+
+    pshufd     xmm5, xmm5, 0        // area
+    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
+    psrld      xmm6, 16
+    cvtdq2ps   xmm6, xmm6
+    addps      xmm5, xmm6           // (65536.0 + area - 1)
+    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
+    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
+    packssdw   xmm5, xmm5           // 16 bit shorts
+
+    // 4 pixel loop small blocks.
+    align      4
+  s4:
+    // top left
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+
+    // - top right
+    psubd      xmm0, [eax + edx * 4]
+    psubd      xmm1, [eax + edx * 4 + 16]
+    psubd      xmm2, [eax + edx * 4 + 32]
+    psubd      xmm3, [eax + edx * 4 + 48]
+    lea        eax, [eax + 64]
+
+    // - bottom left
+    psubd      xmm0, [esi]
+    psubd      xmm1, [esi + 16]
+    psubd      xmm2, [esi + 32]
+    psubd      xmm3, [esi + 48]
+
+    // + bottom right
+    paddd      xmm0, [esi + edx * 4]
+    paddd      xmm1, [esi + edx * 4 + 16]
+    paddd      xmm2, [esi + edx * 4 + 32]
+    paddd      xmm3, [esi + edx * 4 + 48]
+    lea        esi, [esi + 64]
+
+    packssdw   xmm0, xmm1  // pack 4 pixels into 2 registers
+    packssdw   xmm2, xmm3
+
+    pmulhuw    xmm0, xmm5
+    pmulhuw    xmm2, xmm5
+
+    packuswb   xmm0, xmm2
+    movdqu     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 4
+    jge        s4
+
+    jmp        l4b
+
+    // 4 pixel loop
+    align      4
+  l4:
+    // top left
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + 32]
+    movdqa     xmm3, [eax + 48]
+
+    // - top right
+    psubd      xmm0, [eax + edx * 4]
+    psubd      xmm1, [eax + edx * 4 + 16]
+    psubd      xmm2, [eax + edx * 4 + 32]
+    psubd      xmm3, [eax + edx * 4 + 48]
+    lea        eax, [eax + 64]
+
+    // - bottom left
+    psubd      xmm0, [esi]
+    psubd      xmm1, [esi + 16]
+    psubd      xmm2, [esi + 32]
+    psubd      xmm3, [esi + 48]
+
+    // + bottom right
+    paddd      xmm0, [esi + edx * 4]
+    paddd      xmm1, [esi + edx * 4 + 16]
+    paddd      xmm2, [esi + edx * 4 + 32]
+    paddd      xmm3, [esi + edx * 4 + 48]
+    lea        esi, [esi + 64]
+
+    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
+    cvtdq2ps   xmm1, xmm1
+    mulps      xmm0, xmm4
+    mulps      xmm1, xmm4
+    cvtdq2ps   xmm2, xmm2
+    cvtdq2ps   xmm3, xmm3
+    mulps      xmm2, xmm4
+    mulps      xmm3, xmm4
+    cvtps2dq   xmm0, xmm0
+    cvtps2dq   xmm1, xmm1
+    cvtps2dq   xmm2, xmm2
+    cvtps2dq   xmm3, xmm3
+    packssdw   xmm0, xmm1
+    packssdw   xmm2, xmm3
+    packuswb   xmm0, xmm2
+    movdqu     [edi], xmm0
+    lea        edi, [edi + 16]
+    sub        ecx, 4
+    jge        l4
+
+  l4b:
+    add        ecx, 4 - 1
+    jl         l1b
+
+    // 1 pixel loop
+    align      4
+  l1:
+    movdqa     xmm0, [eax]
+    psubd      xmm0, [eax + edx * 4]
+    lea        eax, [eax + 16]
+    psubd      xmm0, [esi]
+    paddd      xmm0, [esi + edx * 4]
+    lea        esi, [esi + 16]
+    cvtdq2ps   xmm0, xmm0
+    mulps      xmm0, xmm4
+    cvtps2dq   xmm0, xmm0
+    packssdw   xmm0, xmm0
+    packuswb   xmm0, xmm0
+    movd       dword ptr [edi], xmm0
+    lea        edi, [edi + 4]
+    sub        ecx, 1
+    jge        l1
+  l1b:
+  }
+}
+#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
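Editor's note: a scalar sketch of the integral-image averaging the SSE2 routine above implements, under the same conventions (width is the area width in ints, i.e. 4 per ARGB pixel); the _C_sketch name is hypothetical.

#include <stdint.h>

static void CumulativeSumToAverageRow_C_sketch(const int32_t* topleft,
                                               const int32_t* botleft,
                                               int width, int area,
                                               uint8_t* dst, int count) {
  const float ooa = 1.0f / area;
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      // Rectangle sum from the cumulative-sum buffer: TL - TR - BL + BR.
      int32_t sum = topleft[c] - topleft[width + c]
                  - botleft[c] + botleft[width + c];
      dst[c] = (uint8_t)(sum * ooa);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}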
+
+#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
+// Creates a table of cumulative sums where each value is a sum of all values
+// above and to the left of the value.
+void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
+                                  const int32* previous_cumsum, int width) {
+  __asm {
+    mov        eax, row
+    mov        edx, cumsum
+    mov        esi, previous_cumsum
+    mov        ecx, width
+    pxor       xmm0, xmm0
+    pxor       xmm1, xmm1
+
+    sub        ecx, 4
+    jl         l4b
+    test       edx, 15
+    jne        l4b
+
+    // 4 pixel loop
+    align      4
+  l4:
+    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
+    lea        eax, [eax + 16]
+    movdqa     xmm4, xmm2
+
+    punpcklbw  xmm2, xmm1
+    movdqa     xmm3, xmm2
+    punpcklwd  xmm2, xmm1
+    punpckhwd  xmm3, xmm1
+
+    punpckhbw  xmm4, xmm1
+    movdqa     xmm5, xmm4
+    punpcklwd  xmm4, xmm1
+    punpckhwd  xmm5, xmm1
+
+    paddd      xmm0, xmm2
+    movdqa     xmm2, [esi]  // previous row above.
+    paddd      xmm2, xmm0
+
+    paddd      xmm0, xmm3
+    movdqa     xmm3, [esi + 16]
+    paddd      xmm3, xmm0
+
+    paddd      xmm0, xmm4
+    movdqa     xmm4, [esi + 32]
+    paddd      xmm4, xmm0
+
+    paddd      xmm0, xmm5
+    movdqa     xmm5, [esi + 48]
+    lea        esi, [esi + 64]
+    paddd      xmm5, xmm0
+
+    movdqa     [edx], xmm2
+    movdqa     [edx + 16], xmm3
+    movdqa     [edx + 32], xmm4
+    movdqa     [edx + 48], xmm5
+
+    lea        edx, [edx + 64]
+    sub        ecx, 4
+    jge        l4
+
+  l4b:
+    add        ecx, 4 - 1
+    jl         l1b
+
+    // 1 pixel loop
+    align      4
+  l1:
+    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
+    lea        eax, [eax + 4]
+    punpcklbw  xmm2, xmm1
+    punpcklwd  xmm2, xmm1
+    paddd      xmm0, xmm2
+    movdqu     xmm2, [esi]
+    lea        esi, [esi + 16]
+    paddd      xmm2, xmm0
+    movdqu     [edx], xmm2
+    lea        edx, [edx + 16]
+    sub        ecx, 1
+    jge        l1
+
+ l1b:
+  }
+}
+#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
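Editor's note: a scalar sketch of the cumulative-sum row above; each output int is the running per-channel sum of this row plus the value directly above it from previous_cumsum. The _C_sketch name is the editor's.

#include <stdint.h>

static void ComputeCumulativeSumRow_C_sketch(const uint8_t* row, int32_t* cumsum,
                                             const int32_t* previous_cumsum,
                                             int width) {
  int32_t row_sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      row_sum[c] += row[4 * x + c];
      cumsum[4 * x + c] = row_sum[c] + previous_cumsum[4 * x + c];
    }
  }
}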
+
+#ifdef HAS_ARGBAFFINEROW_SSE2
+// Copy a row of ARGB pixels sampled from the source image along an affine
+// (u, v) path: uv_dudv holds the start point (u, v) and per-pixel step (du, dv).
+__declspec(naked) __declspec(align(16))
+LIBYUV_API
+void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
+                        uint8* dst_argb, const float* uv_dudv, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 12]  // src_argb
+    mov        esi, [esp + 16]  // stride
+    mov        edx, [esp + 20]  // dst_argb
+    mov        ecx, [esp + 24]  // pointer to uv_dudv
+    movq       xmm2, qword ptr [ecx]  // uv
+    movq       xmm7, qword ptr [ecx + 8]  // dudv
+    mov        ecx, [esp + 28]  // width
+    shl        esi, 16          // 4, stride
+    add        esi, 4
+    movd       xmm5, esi
+    sub        ecx, 4
+    jl         l4b
+
+    // setup for 4 pixel loop
+    pshufd     xmm7, xmm7, 0x44  // dup dudv
+    pshufd     xmm5, xmm5, 0  // dup 4, stride
+    movdqa     xmm0, xmm2    // x0, y0, x1, y1
+    addps      xmm0, xmm7
+    movlhps    xmm2, xmm0
+    movdqa     xmm4, xmm7
+    addps      xmm4, xmm4    // dudv *= 2
+    movdqa     xmm3, xmm2    // x2, y2, x3, y3
+    addps      xmm3, xmm4
+    addps      xmm4, xmm4    // dudv *= 4
+
+    // 4 pixel loop
+    align      4
+  l4:
+    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
+    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
+    packssdw   xmm0, xmm1    // x, y as 8 shorts
+    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       edi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       xmm1, [eax + esi]  // read pixel 0
+    movd       xmm6, [eax + edi]  // read pixel 1
+    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
+    addps      xmm2, xmm4    // x, y += dx, dy first 2
+    movq       qword ptr [edx], xmm1
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // shift right
+    movd       edi, xmm0
+    movd       xmm6, [eax + esi]  // read pixel 2
+    movd       xmm0, [eax + edi]  // read pixel 3
+    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
+    addps      xmm3, xmm4    // x, y += dx, dy next 2
+    sub        ecx, 4
+    movq       qword ptr 8[edx], xmm6
+    lea        edx, [edx + 16]
+    jge        l4
+
+  l4b:
+    add        ecx, 4 - 1
+    jl         l1b
+
+    // 1 pixel loop
+    align      4
+  l1:
+    cvttps2dq  xmm0, xmm2    // x, y float to int
+    packssdw   xmm0, xmm0    // x, y as shorts
+    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
+    addps      xmm2, xmm7    // x, y += dx, dy
+    movd       esi, xmm0
+    movd       xmm0, [eax + esi]  // copy a pixel
+    sub        ecx, 1
+    movd       [edx], xmm0
+    lea        edx, [edx + 4]
+    jge        l1
+  l1b:
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBAFFINEROW_SSE2
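Editor's note: a scalar sketch of the affine row copy above, assuming the same contract (no bounds checking; x and y are truncated toward zero as cvttps2dq does). Names and the include are the editor's.

#include <stdint.h>

static void ARGBAffineRow_C_sketch(const uint8_t* src_argb, int src_argb_stride,
                                   uint8_t* dst_argb, const float* uv_dudv,
                                   int width) {
  float u = uv_dudv[0], v = uv_dudv[1];
  const float du = uv_dudv[2], dv = uv_dudv[3];
  for (int i = 0; i < width; ++i) {
    int x = (int)u, y = (int)v;
    const uint8_t* s = src_argb + y * src_argb_stride + x * 4;  // offset = x * 4 + y * stride
    uint8_t* d = dst_argb + 4 * i;
    d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
    u += du;
    v += dv;
  }
}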
+
+#ifdef HAS_INTERPOLATEROW_AVX2
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    shr        eax, 1
+    // Dispatch to specialized filters if applicable.
+    cmp        eax, 0
+    je         xloop100  // 0 / 128.  Blend 100 / 0.
+    sub        edi, esi
+    cmp        eax, 32
+    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
+    cmp        eax, 64
+    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
+    cmp        eax, 96
+    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
+
+    vmovd      xmm0, eax  // high fraction 0..127
+    neg        eax
+    add        eax, 128
+    vmovd      xmm5, eax  // low fraction 128..1
+    vpunpcklbw xmm5, xmm5, xmm0
+    vpunpcklwd xmm5, xmm5, xmm5
+    vpxor      ymm0, ymm0, ymm0
+    vpermd     ymm5, ymm0, ymm5
+
+    align      4
+  xloop:
+    vmovdqu    ymm0, [esi]
+    vmovdqu    ymm2, [esi + edx]
+    vpunpckhbw ymm1, ymm0, ymm2  // mutates
+    vpunpcklbw ymm0, ymm0, ymm2  // mutates
+    vpmaddubsw ymm0, ymm0, ymm5
+    vpmaddubsw ymm1, ymm1, ymm5
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm1, ymm1, 7
+    vpackuswb  ymm0, ymm0, ymm1  // unmutates
+    sub        ecx, 32
+    vmovdqu    [esi + edi], ymm0
+    lea        esi, [esi + 32]
+    jg         xloop
+    jmp        xloop99
+
+    // Blend 25 / 75.
+    align      4
+  xloop25:
+    vmovdqu    ymm0, [esi]
+    vpavgb     ymm0, ymm0, [esi + edx]
+    vpavgb     ymm0, ymm0, [esi + edx]
+    sub        ecx, 32
+    vmovdqu    [esi + edi], ymm0
+    lea        esi, [esi + 32]
+    jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+    align      4
+  xloop50:
+    vmovdqu    ymm0, [esi]
+    vpavgb     ymm0, ymm0, [esi + edx]
+    sub        ecx, 32
+    vmovdqu    [esi + edi], ymm0
+    lea        esi, [esi + 32]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      4
+  xloop75:
+    vmovdqu    ymm0, [esi + edx]
+    vpavgb     ymm0, ymm0, [esi]
+    vpavgb     ymm0, ymm0, [esi]
+    sub        ecx, 32
+    vmovdqu     [esi + edi], ymm0
+    lea        esi, [esi + 32]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      4
+  xloop100:
+    rep movsb
+
+  xloop99:
+    pop        edi
+    pop        esi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_INTERPOLATEROW_AVX2
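Editor's note: all InterpolateRow variants in this file compute the same vertical blend; a scalar sketch follows (editor's illustration, with a hypothetical _C_sketch name). The SIMD paths halve the fraction to 7 bits and branch to the special 0/25/50/75/100% cases checked above.

#include <stddef.h>
#include <stdint.h>

static void InterpolateRow_C_sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  const int y1 = source_y_fraction;        // weight of the lower row (0..255)
  const int y0 = 256 - y1;                 // weight of the upper row
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[x] * y0 + src_ptr1[x] * y1) >> 8);
  }
}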
+
+#ifdef HAS_INTERPOLATEROW_SSSE3
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                          ptrdiff_t src_stride, int dst_width,
+                          int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
+    shr        eax, 1
+    // Dispatch to specialized filters if applicable.
+    cmp        eax, 0
+    je         xloop100  // 0 / 128.  Blend 100 / 0.
+    cmp        eax, 32
+    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
+    cmp        eax, 64
+    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
+    cmp        eax, 96
+    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
+
+    movd       xmm0, eax  // high fraction 0..127
+    neg        eax
+    add        eax, 128
+    movd       xmm5, eax  // low fraction 128..1
+    punpcklbw  xmm5, xmm0
+    punpcklwd  xmm5, xmm5
+    pshufd     xmm5, xmm5, 0
+
+    align      4
+  xloop:
+    movdqa     xmm0, [esi]
+    movdqa     xmm2, [esi + edx]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm2
+    punpckhbw  xmm1, xmm2
+    pmaddubsw  xmm0, xmm5
+    pmaddubsw  xmm1, xmm5
+    psrlw      xmm0, 7
+    psrlw      xmm1, 7
+    packuswb   xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop
+    jmp        xloop99
+
+    // Blend 25 / 75.
+    align      4
+  xloop25:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+    align      4
+  xloop50:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      4
+  xloop75:
+    movdqa     xmm1, [esi]
+    movdqa     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      4
+  xloop100:
+    movdqa     xmm0, [esi]
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+
+  xloop99:
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_INTERPOLATEROW_SSSE3
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
+    // Dispatch to specialized filters if applicable.
+    cmp        eax, 0
+    je         xloop100  // 0 / 256.  Blend 100 / 0.
+    cmp        eax, 64
+    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
+    cmp        eax, 128
+    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
+    cmp        eax, 192
+    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
+
+    movd       xmm5, eax            // xmm5 = y fraction
+    punpcklbw  xmm5, xmm5
+    psrlw      xmm5, 1
+    punpcklwd  xmm5, xmm5
+    punpckldq  xmm5, xmm5
+    punpcklqdq xmm5, xmm5
+    pxor       xmm4, xmm4
+
+    align      4
+  xloop:
+    movdqa     xmm0, [esi]  // row0
+    movdqa     xmm2, [esi + edx]  // row1
+    movdqa     xmm1, xmm0
+    movdqa     xmm3, xmm2
+    punpcklbw  xmm2, xmm4
+    punpckhbw  xmm3, xmm4
+    punpcklbw  xmm0, xmm4
+    punpckhbw  xmm1, xmm4
+    psubw      xmm2, xmm0  // row1 - row0
+    psubw      xmm3, xmm1
+    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
+    paddw      xmm3, xmm3
+    pmulhw     xmm2, xmm5  // scale diff
+    pmulhw     xmm3, xmm5
+    paddw      xmm0, xmm2  // sum rows
+    paddw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop
+    jmp        xloop99
+
+    // Blend 25 / 75.
+    align      4
+  xloop25:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+    align      4
+  xloop50:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      4
+  xloop75:
+    movdqa     xmm1, [esi]
+    movdqa     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      4
+  xloop100:
+    movdqa     xmm0, [esi]
+    sub        ecx, 16
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+
+  xloop99:
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_INTERPOLATEROW_SSE2
+
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                                    ptrdiff_t src_stride, int dst_width,
+                                    int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
+    shr        eax, 1
+    // Dispatch to specialized filters if applicable.
+    cmp        eax, 0
+    je         xloop100  // 0 / 128.  Blend 100 / 0.
+    cmp        eax, 32
+    je         xloop75   // 32 / 128 is 0.25.  Blend 75 / 25.
+    cmp        eax, 64
+    je         xloop50   // 64 / 128 is 0.50.  Blend 50 / 50.
+    cmp        eax, 96
+    je         xloop25   // 96 / 128 is 0.75.  Blend 25 / 75.
+
+    movd       xmm0, eax  // high fraction 0..127
+    neg        eax
+    add        eax, 128
+    movd       xmm5, eax  // low fraction 128..1
+    punpcklbw  xmm5, xmm0
+    punpcklwd  xmm5, xmm5
+    pshufd     xmm5, xmm5, 0
+
+    align      4
+  xloop:
+    movdqu     xmm0, [esi]
+    movdqu     xmm2, [esi + edx]
+    movdqu     xmm1, xmm0
+    punpcklbw  xmm0, xmm2
+    punpckhbw  xmm1, xmm2
+    pmaddubsw  xmm0, xmm5
+    pmaddubsw  xmm1, xmm5
+    psrlw      xmm0, 7
+    psrlw      xmm1, 7
+    packuswb   xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop
+    jmp        xloop99
+
+    // Blend 25 / 75.
+    align      4
+  xloop25:
+    movdqu     xmm0, [esi]
+    movdqu     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+    align      4
+  xloop50:
+    movdqu     xmm0, [esi]
+    movdqu     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      4
+  xloop75:
+    movdqu     xmm1, [esi]
+    movdqu     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      4
+  xloop100:
+    movdqu     xmm0, [esi]
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+
+  xloop99:
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+#ifdef HAS_INTERPOLATEROW_SSE2
+// Bilinear filter 16x2 -> 16x1
+__declspec(naked) __declspec(align(16))
+void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                                   ptrdiff_t src_stride, int dst_width,
+                                   int source_y_fraction) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]   // dst_ptr
+    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edx, [esp + 8 + 12]  // src_stride
+    mov        ecx, [esp + 8 + 16]  // dst_width
+    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
+    sub        edi, esi
+    // Dispatch to specialized filters if applicable.
+    cmp        eax, 0
+    je         xloop100  // 0 / 256.  Blend 100 / 0.
+    cmp        eax, 64
+    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
+    cmp        eax, 128
+    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
+    cmp        eax, 192
+    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
+
+    movd       xmm5, eax            // xmm5 = y fraction
+    punpcklbw  xmm5, xmm5
+    psrlw      xmm5, 1
+    punpcklwd  xmm5, xmm5
+    punpckldq  xmm5, xmm5
+    punpcklqdq xmm5, xmm5
+    pxor       xmm4, xmm4
+
+    align      4
+  xloop:
+    movdqu     xmm0, [esi]  // row0
+    movdqu     xmm2, [esi + edx]  // row1
+    movdqu     xmm1, xmm0
+    movdqu     xmm3, xmm2
+    punpcklbw  xmm2, xmm4
+    punpckhbw  xmm3, xmm4
+    punpcklbw  xmm0, xmm4
+    punpckhbw  xmm1, xmm4
+    psubw      xmm2, xmm0  // row1 - row0
+    psubw      xmm3, xmm1
+    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
+    paddw      xmm3, xmm3
+    pmulhw     xmm2, xmm5  // scale diff
+    pmulhw     xmm3, xmm5
+    paddw      xmm0, xmm2  // sum rows
+    paddw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop
+    jmp        xloop99
+
+    // Blend 25 / 75.
+    align      4
+  xloop25:
+    movdqu     xmm0, [esi]
+    movdqu     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+    align      4
+  xloop50:
+    movdqu     xmm0, [esi]
+    movdqu     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      4
+  xloop75:
+    movdqu     xmm1, [esi]
+    movdqu     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      4
+  xloop100:
+    movdqu     xmm0, [esi]
+    sub        ecx, 16
+    movdqu     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+
+  xloop99:
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_INTERPOLATEROW_SSE2
+
+__declspec(naked) __declspec(align(16))
+void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // src_uv_stride
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // pix
+    sub        edi, eax
+
+    align      4
+  convertloop:
+    movdqa     xmm0, [eax]
+    pavgb      xmm0, [eax + edx]
+    sub        ecx, 16
+    movdqa     [eax + edi], xmm0
+    lea        eax,  [eax + 16]
+    jg         convertloop
+    pop        edi
+    ret
+  }
+}
+
+#ifdef HAS_HALFROW_AVX2
+__declspec(naked) __declspec(align(16))
+void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
+                  uint8* dst_uv, int pix) {
+  __asm {
+    push       edi
+    mov        eax, [esp + 4 + 4]    // src_uv
+    mov        edx, [esp + 4 + 8]    // src_uv_stride
+    mov        edi, [esp + 4 + 12]   // dst_uv
+    mov        ecx, [esp + 4 + 16]   // pix
+    sub        edi, eax
+
+    align      4
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vpavgb     ymm0, ymm0, [eax + edx]
+    sub        ecx, 32
+    vmovdqu    [eax + edi], ymm0
+    lea        eax,  [eax + 32]
+    jg         convertloop
+
+    pop        edi
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_HALFROW_AVX2
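Editor's note: HalfRow averages each byte with the byte one stride below, rounding up as pavgb/vpavgb do; a scalar sketch (editor's, hypothetical name):

#include <stdint.h>

static void HalfRow_C_sketch(const uint8_t* src_uv, int src_uv_stride,
                             uint8_t* dst_uv, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_uv[i] = (uint8_t)((src_uv[i] + src_uv[i + src_uv_stride] + 1) >> 1);
  }
}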
+
+__declspec(naked) __declspec(align(16))
+void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
+                          uint32 selector, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_bayer
+    movd       xmm5, [esp + 12]  // selector
+    mov        ecx, [esp + 16]   // pix
+    pshufd     xmm5, xmm5, 0
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm5
+    pshufb     xmm1, xmm5
+    punpckldq  xmm0, xmm1
+    sub        ecx, 8
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    jg         wloop
+    ret
+  }
+}
+
+// Specialized ARGB to Bayer that just isolates G channel.
+__declspec(naked) __declspec(align(16))
+void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
+                           uint32 selector, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_bayer
+                                 // selector
+    mov        ecx, [esp + 16]   // pix
+    pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
+    psrld      xmm5, 24
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    psrld      xmm0, 8  // Move green to bottom.
+    psrld      xmm1, 8
+    pand       xmm0, xmm5
+    pand       xmm1, xmm5
+    packssdw   xmm0, xmm1
+    packuswb   xmm0, xmm1
+    sub        ecx, 8
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    jg         wloop
+    ret
+  }
+}
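Editor's note: a scalar sketch of the G-channel isolation above; the selector argument is unused in this specialization. Name and include are the editor's.

#include <stdint.h>

static void ARGBToBayerGGRow_C_sketch(const uint8_t* src_argb,
                                      uint8_t* dst_bayer,
                                      uint32_t selector, int pix) {
  (void)selector;  // ignored; this variant always takes G
  for (int i = 0; i < pix; ++i) {
    dst_bayer[i] = src_argb[4 * i + 1];  // G is byte 1 of each ARGB pixel
  }
}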
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                          const uint8* shuffler, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_argb
+    mov        ecx, [esp + 12]   // shuffler
+    movdqa     xmm5, [ecx]
+    mov        ecx, [esp + 16]   // pix
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm5
+    pshufb     xmm1, xmm5
+    sub        ecx, 8
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    jg         wloop
+    ret
+  }
+}
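Editor's note: a scalar sketch of the 4-byte shuffle the ARGBShuffleRow variants implement; shuffler[b] (0..3) selects which source byte of each pixel lands in destination byte b. Sketch and name are the editor's.

#include <stdint.h>

static void ARGBShuffleRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                    const uint8_t* shuffler, int pix) {
  for (int i = 0; i < pix; ++i) {
    for (int b = 0; b < 4; ++b) {
      dst_argb[4 * i + b] = src_argb[4 * i + shuffler[b]];
    }
  }
}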
+
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                    const uint8* shuffler, int pix) {
+  __asm {
+    mov        eax, [esp + 4]    // src_argb
+    mov        edx, [esp + 8]    // dst_argb
+    mov        ecx, [esp + 12]   // shuffler
+    movdqa     xmm5, [ecx]
+    mov        ecx, [esp + 16]   // pix
+
+    align      4
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm5
+    pshufb     xmm1, xmm5
+    sub        ecx, 8
+    movdqu     [edx], xmm0
+    movdqu     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    jg         wloop
+    ret
+  }
+}
+
+#ifdef HAS_ARGBSHUFFLEROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  __asm {
+    mov        eax, [esp + 4]     // src_argb
+    mov        edx, [esp + 8]     // dst_argb
+    mov        ecx, [esp + 12]    // shuffler
+    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
+    mov        ecx, [esp + 16]    // pix
+
+    align      4
+  wloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    lea        eax, [eax + 64]
+    vpshufb    ymm0, ymm0, ymm5
+    vpshufb    ymm1, ymm1, ymm5
+    sub        ecx, 16
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    jg         wloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBSHUFFLEROW_AVX2
+
+__declspec(naked) __declspec(align(16))
+void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+                         const uint8* shuffler, int pix) {
+  __asm {
+    push       ebx
+    push       esi
+    mov        eax, [esp + 8 + 4]    // src_argb
+    mov        edx, [esp + 8 + 8]    // dst_argb
+    mov        esi, [esp + 8 + 12]   // shuffler
+    mov        ecx, [esp + 8 + 16]   // pix
+    pxor       xmm5, xmm5
+
+    mov        ebx, [esi]   // shuffler
+    cmp        ebx, 0x03000102
+    je         shuf_3012
+    cmp        ebx, 0x00010203
+    je         shuf_0123
+    cmp        ebx, 0x00030201
+    je         shuf_0321
+    cmp        ebx, 0x02010003
+    je         shuf_2103
+
+  // TODO(fbarchard): Use one source pointer and 3 offsets.
+  shuf_any1:
+    movzx      ebx, byte ptr [esi]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx], bl
+    movzx      ebx, byte ptr [esi + 1]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 1], bl
+    movzx      ebx, byte ptr [esi + 2]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 2], bl
+    movzx      ebx, byte ptr [esi + 3]
+    movzx      ebx, byte ptr [eax + ebx]
+    mov        [edx + 3], bl
+    lea        eax, [eax + 4]
+    lea        edx, [edx + 4]
+    sub        ecx, 1
+    jg         shuf_any1
+    jmp        shuf99
+
+    align      4
+  shuf_0123:
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
+    pshuflw    xmm0, xmm0, 01Bh
+    pshufhw    xmm1, xmm1, 01Bh
+    pshuflw    xmm1, xmm1, 01Bh
+    packuswb   xmm0, xmm1
+    sub        ecx, 4
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         shuf_0123
+    jmp        shuf99
+
+    align      4
+  shuf_0321:
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
+    pshuflw    xmm0, xmm0, 039h
+    pshufhw    xmm1, xmm1, 039h
+    pshuflw    xmm1, xmm1, 039h
+    packuswb   xmm0, xmm1
+    sub        ecx, 4
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         shuf_0321
+    jmp        shuf99
+
+    align      4
+  shuf_2103:
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
+    pshuflw    xmm0, xmm0, 093h
+    pshufhw    xmm1, xmm1, 093h
+    pshuflw    xmm1, xmm1, 093h
+    packuswb   xmm0, xmm1
+    sub        ecx, 4
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         shuf_2103
+    jmp        shuf99
+
+    align      4
+  shuf_3012:
+    movdqu     xmm0, [eax]
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm5
+    punpckhbw  xmm1, xmm5
+    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
+    pshuflw    xmm0, xmm0, 0C6h
+    pshufhw    xmm1, xmm1, 0C6h
+    pshuflw    xmm1, xmm1, 0C6h
+    packuswb   xmm0, xmm1
+    sub        ecx, 4
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         shuf_3012
+
+  shuf99:
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// YUY2 - Macro-pixel = 2 image pixels
+// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
+
+// UYVY - Macro-pixel = 2 image pixels
+// U0Y0V0Y1
+
+__declspec(naked) __declspec(align(16))
+void I422ToYUY2Row_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_y
+    mov        esi, [esp + 8 + 8]    // src_u
+    mov        edx, [esp + 8 + 12]   // src_v
+    mov        edi, [esp + 8 + 16]   // dst_frame
+    mov        ecx, [esp + 8 + 20]   // width
+    sub        edx, esi
+
+    align      4
+  convertloop:
+    movq       xmm2, qword ptr [esi] // U
+    movq       xmm3, qword ptr [esi + edx] // V
+    lea        esi, [esi + 8]
+    punpcklbw  xmm2, xmm3 // UV
+    movdqu     xmm0, [eax] // Y
+    lea        eax, [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm2 // YUYV
+    punpckhbw  xmm1, xmm2
+    movdqu     [edi], xmm0
+    movdqu     [edi + 16], xmm1
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))
+void I422ToUYVYRow_SSE2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_frame, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_y
+    mov        esi, [esp + 8 + 8]    // src_u
+    mov        edx, [esp + 8 + 12]   // src_v
+    mov        edi, [esp + 8 + 16]   // dst_frame
+    mov        ecx, [esp + 8 + 20]   // width
+    sub        edx, esi
+
+    align      4
+  convertloop:
+    movq       xmm2, qword ptr [esi] // U
+    movq       xmm3, qword ptr [esi + edx] // V
+    lea        esi, [esi + 8]
+    punpcklbw  xmm2, xmm3 // UV
+    movdqu     xmm0, [eax] // Y
+    movdqa     xmm1, xmm2
+    lea        eax, [eax + 16]
+    punpcklbw  xmm1, xmm0 // UYVY
+    punpckhbw  xmm2, xmm0
+    movdqu     [edi], xmm1
+    movdqu     [edi + 16], xmm2
+    lea        edi, [edi + 32]
+    sub        ecx, 16
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
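Editor's note: a scalar sketch of the YUY2 packing above (UYVY differs only in byte order: U0 Y0 V0 Y1); editor's illustration under the same even-width assumption the SSE2 loops make.

#include <stdint.h>

static void I422ToYUY2Row_C_sketch(const uint8_t* src_y, const uint8_t* src_u,
                                   const uint8_t* src_v, uint8_t* dst_frame,
                                   int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_y[0];   // Y0
    dst_frame[1] = src_u[0];   // U shared by the pair
    dst_frame[2] = src_y[1];   // Y1
    dst_frame[3] = src_v[0];   // V shared by the pair
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}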
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+__declspec(naked) __declspec(align(16))
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* src_argb */
+    mov        edx, [esp + 4 + 8]   /* dst_argb */
+    mov        esi, [esp + 4 + 12]  /* poly */
+    mov        ecx, [esp + 4 + 16]  /* width */
+    pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
+
+    // 2 pixel loop.
+    align      4
+ convertloop:
+//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
+    movq       xmm0, qword ptr [eax]  // BGRABGRA
+    lea        eax, [eax + 8]
+    punpcklbw  xmm0, xmm3
+    movdqa     xmm4, xmm0
+    punpcklwd  xmm0, xmm3  // pixel 0
+    punpckhwd  xmm4, xmm3  // pixel 1
+    cvtdq2ps   xmm0, xmm0  // 4 floats
+    cvtdq2ps   xmm4, xmm4
+    movdqa     xmm1, xmm0  // X
+    movdqa     xmm5, xmm4
+    mulps      xmm0, [esi + 16]  // C1 * X
+    mulps      xmm4, [esi + 16]
+    addps      xmm0, [esi]  // result = C0 + C1 * X
+    addps      xmm4, [esi]
+    movdqa     xmm2, xmm1
+    movdqa     xmm6, xmm5
+    mulps      xmm2, xmm1  // X * X
+    mulps      xmm6, xmm5
+    mulps      xmm1, xmm2  // X * X * X
+    mulps      xmm5, xmm6
+    mulps      xmm2, [esi + 32]  // C2 * X * X
+    mulps      xmm6, [esi + 32]
+    mulps      xmm1, [esi + 48]  // C3 * X * X * X
+    mulps      xmm5, [esi + 48]
+    addps      xmm0, xmm2  // result += C2 * X * X
+    addps      xmm4, xmm6
+    addps      xmm0, xmm1  // result += C3 * X * X * X
+    addps      xmm4, xmm5
+    cvttps2dq  xmm0, xmm0
+    cvttps2dq  xmm4, xmm4
+    packuswb   xmm0, xmm4
+    packuswb   xmm0, xmm0
+    sub        ecx, 2
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    jg         convertloop
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
+
+#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
+__declspec(naked) __declspec(align(16))
+void ARGBPolynomialRow_AVX2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_argb */
+    mov        ecx, [esp + 12]   /* poly */
+    vbroadcastf128 ymm4, [ecx]       // C0
+    vbroadcastf128 ymm5, [ecx + 16]  // C1
+    vbroadcastf128 ymm6, [ecx + 32]  // C2
+    vbroadcastf128 ymm7, [ecx + 48]  // C3
+    mov        ecx, [esp + 16]  /* width */
+
+    // 2 pixel loop.
+    align      4
+ convertloop:
+    vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
+    lea         eax, [eax + 8]
+    vcvtdq2ps   ymm0, ymm0        // X 8 floats
+    vmulps      ymm2, ymm0, ymm0  // X * X
+    vmulps      ymm3, ymm0, ymm7  // C3 * X
+    vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
+    vfmadd231ps ymm0, ymm2, ymm6  // result += C2 * X * X
+    vfmadd231ps ymm0, ymm2, ymm3  // result += C3 * X * X * X
+    vcvttps2dq  ymm0, ymm0
+    vpackusdw   ymm0, ymm0, ymm0  // b0g0r0a0_00000000_b0g0r0a0_00000000
+    vpermq      ymm0, ymm0, 0xd8  // b0g0r0a0_b0g0r0a0_00000000_00000000
+    vpackuswb   xmm0, xmm0, xmm0  // bgrabgra_00000000_00000000_00000000
+    sub         ecx, 2
+    vmovq       qword ptr [edx], xmm0
+    lea         edx, [edx + 8]
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
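Editor's note: both polynomial rows above evaluate, per channel value X, result = C0 + C1*X + C2*X*X + C3*X*X*X with one coefficient lane per channel; a scalar sketch (editor's, clamping to 0..255 where the SIMD paths rely on saturating packs):

#include <stdint.h>

static void ARGBPolynomialRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                       const float* poly, int width) {
  for (int i = 0; i < width * 4; ++i) {
    const int c = i & 3;  // channel index selects the coefficient lane
    const float x = (float)src_argb[i];
    float v = poly[c] + poly[4 + c] * x + poly[8 + c] * x * x +
              poly[12 + c] * x * x * x;
    if (v < 0.0f) v = 0.0f;
    if (v > 255.0f) v = 255.0f;
    dst_argb[i] = (uint8_t)v;
  }
}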
+
+#ifdef HAS_ARGBCOLORTABLEROW_X86
+// Transform ARGB pixels with color table.
+__declspec(naked) __declspec(align(16))
+void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+                           int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* dst_argb */
+    mov        esi, [esp + 4 + 8]   /* table_argb */
+    mov        ecx, [esp + 4 + 12]  /* width */
+
+    // 1 pixel loop.
+    align      4
+  convertloop:
+    movzx      edx, byte ptr [eax]
+    lea        eax, [eax + 4]
+    movzx      edx, byte ptr [esi + edx * 4]
+    mov        byte ptr [eax - 4], dl
+    movzx      edx, byte ptr [eax - 4 + 1]
+    movzx      edx, byte ptr [esi + edx * 4 + 1]
+    mov        byte ptr [eax - 4 + 1], dl
+    movzx      edx, byte ptr [eax - 4 + 2]
+    movzx      edx, byte ptr [esi + edx * 4 + 2]
+    mov        byte ptr [eax - 4 + 2], dl
+    movzx      edx, byte ptr [eax - 4 + 3]
+    movzx      edx, byte ptr [esi + edx * 4 + 3]
+    mov        byte ptr [eax - 4 + 3], dl
+    dec        ecx
+    jg         convertloop
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBCOLORTABLEROW_X86
+
+#ifdef HAS_RGBCOLORTABLEROW_X86
+// Transform RGB pixels with color table.
+__declspec(naked) __declspec(align(16))
+void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]   /* dst_argb */
+    mov        esi, [esp + 4 + 8]   /* table_argb */
+    mov        ecx, [esp + 4 + 12]  /* width */
+
+    // 1 pixel loop.
+    align      4
+  convertloop:
+    movzx      edx, byte ptr [eax]
+    lea        eax, [eax + 4]
+    movzx      edx, byte ptr [esi + edx * 4]
+    mov        byte ptr [eax - 4], dl
+    movzx      edx, byte ptr [eax - 4 + 1]
+    movzx      edx, byte ptr [esi + edx * 4 + 1]
+    mov        byte ptr [eax - 4 + 1], dl
+    movzx      edx, byte ptr [eax - 4 + 2]
+    movzx      edx, byte ptr [esi + edx * 4 + 2]
+    mov        byte ptr [eax - 4 + 2], dl
+    dec        ecx
+    jg         convertloop
+
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_RGBCOLORTABLEROW_X86
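Editor's note: both color-table rows above do an in-place per-channel lookup; each channel byte indexes a 256-entry, 4-bytes-per-entry table and reads its own column. A scalar sketch (editor's; the RGB variant simply skips the alpha column):

#include <stdint.h>

static void ARGBColorTableRow_C_sketch(uint8_t* dst_argb,
                                       const uint8_t* table_argb, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {   // use c < 3 for the RGB-only variant
      dst_argb[4 * i + c] = table_argb[dst_argb[4 * i + c] * 4 + c];
    }
  }
}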
+
+#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
+// Transform RGB pixels with luma table.
+__declspec(naked) __declspec(align(16))
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                 int width,
+                                 const uint8* luma, uint32 lumacoeff) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   /* src_argb */
+    mov        edi, [esp + 8 + 8]   /* dst_argb */
+    mov        ecx, [esp + 8 + 12]  /* width */
+    movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
+    movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
+    pshufd     xmm2, xmm2, 0
+    pshufd     xmm3, xmm3, 0
+    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
+    psllw      xmm4, 8
+    pxor       xmm5, xmm5
+
+    // 4 pixel loop.
+    align      4
+  convertloop:
+    movdqu     xmm0, qword ptr [eax]      // generate luma ptr
+    pmaddubsw  xmm0, xmm3
+    phaddw     xmm0, xmm0
+    pand       xmm0, xmm4  // mask out low bits
+    punpcklwd  xmm0, xmm5
+    paddd      xmm0, xmm2  // add table base
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi], dl
+    movzx      edx, byte ptr [eax + 1]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 1], dl
+    movzx      edx, byte ptr [eax + 2]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 2], dl
+    movzx      edx, byte ptr [eax + 3]  // copy alpha.
+    mov        byte ptr [edi + 3], dl
+
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax + 4]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 4], dl
+    movzx      edx, byte ptr [eax + 5]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 5], dl
+    movzx      edx, byte ptr [eax + 6]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 6], dl
+    movzx      edx, byte ptr [eax + 7]  // copy alpha.
+    mov        byte ptr [edi + 7], dl
+
+    movd       esi, xmm0
+    pshufd     xmm0, xmm0, 0x39  // 00111001 to rotate right 32
+
+    movzx      edx, byte ptr [eax + 8]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 8], dl
+    movzx      edx, byte ptr [eax + 9]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 9], dl
+    movzx      edx, byte ptr [eax + 10]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 10], dl
+    movzx      edx, byte ptr [eax + 11]  // copy alpha.
+    mov        byte ptr [edi + 11], dl
+
+    movd       esi, xmm0
+
+    movzx      edx, byte ptr [eax + 12]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 12], dl
+    movzx      edx, byte ptr [eax + 13]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 13], dl
+    movzx      edx, byte ptr [eax + 14]
+    movzx      edx, byte ptr [esi + edx]
+    mov        byte ptr [edi + 14], dl
+    movzx      edx, byte ptr [eax + 15]  // copy alpha.
+    mov        byte ptr [edi + 15], dl
+
+    sub        ecx, 4
+    lea        eax, [eax + 16]
+    lea        edi, [edi + 16]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/third_party/libyuv/source/scale.c
+++ /dev/null
@@ -1,3884 +1,0 @@
-/*
- *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "third_party/libyuv/include/libyuv/scale.h"
-
-#include <assert.h>
-#include <string.h>
-
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "third_party/libyuv/source/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-/*
- * Note: Defining YUV_DISABLE_ASM allows to use c version.
- */
-//#define YUV_DISABLE_ASM
-
-#if defined(_MSC_VER)
-#define ALIGN16(var) __declspec(align(16)) var
-#else
-#define ALIGN16(var) var __attribute__((aligned(16)))
-#endif
-
-// Note: A Neon reference manual
-// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
-// Note: Some SSE2 reference manuals
-// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
-
-// Set the following flag to true to revert to only
-// using the reference implementation ScalePlaneBox(), and
-// NOT the optimized versions. Useful for debugging and
-// when comparing the quality of the resulting YUV planes
-// as produced by the optimized and non-optimized versions.
-
-static int use_reference_impl_ = 0;
-
-void SetUseReferenceImpl(int use) {
-  use_reference_impl_ = use;
-}
-
-// ScaleRowDown2Int also used by planar functions
-
-/**
- * NEON downscalers with interpolation.
- *
- * Provided by Fritz Koenig
- *
- */
-
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
-#define HAS_SCALEROWDOWN2_NEON
-void ScaleRowDown2_NEON(const uint8* src_ptr, int  src_stride,
-                        uint8* dst, int dst_width) {
-  asm volatile (
-    "1:                                        \n"
-    "vld2.u8    {q0,q1}, [%0]!                 \n"  // load even pixels into q0, odd into q1
-    "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    "bhi        1b                             \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst),              // %1
-      "+r"(dst_width)         // %2
-    :
-    : "q0", "q1"              // Clobber List
-  );
-}
-
-void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-    "add        %1, %0                         \n"  // change the stride to row 2 pointer
-    "1:                                        \n"
-    "vld1.u8    {q0,q1}, [%0]!                 \n"  // load row 1 and post increment
-    "vld1.u8    {q2,q3}, [%1]!                 \n"  // load row 2 and post increment
-    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
-    "vpaddl.u8  q1, q1                         \n"
-    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent, add row 1 to row 2
-    "vpadal.u8  q1, q3                         \n"
-    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #2                     \n"
-    "vst1.u8    {q0}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    "bhi        1b                             \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(src_stride),       // %1
-      "+r"(dst),              // %2
-      "+r"(dst_width)         // %3
-    :
-    : "q0", "q1", "q2", "q3"     // Clobber List
-   );
-}
-
-#define HAS_SCALEROWDOWN4_NEON
-static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "1:                                        \n"
-    "vld2.u8    {d0, d1}, [%0]!                \n"
-    "vtrn.u8    d1, d0                         \n"
-    "vshrn.u16  d0, q0, #8                     \n"
-    "vst1.u32   {d0[1]}, [%1]!                 \n"
-
-    "subs       %2, #4                         \n"
-    "bhi        1b                             \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width)         // %2
-    :
-    : "q0", "q1", "memory", "cc"
-  );
-}
-
-static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "add        r4, %0, %3                     \n"
-    "add        r5, r4, %3                     \n"
-    "add        %3, r5, %3                     \n"
-    "1:                                        \n"
-    "vld1.u8    {q0}, [%0]!                    \n"   // load up 16x4 block of input data
-    "vld1.u8    {q1}, [r4]!                    \n"
-    "vld1.u8    {q2}, [r5]!                    \n"
-    "vld1.u8    {q3}, [%3]!                    \n"
-
-    "vpaddl.u8  q0, q0                         \n"
-    "vpadal.u8  q0, q1                         \n"
-    "vpadal.u8  q0, q2                         \n"
-    "vpadal.u8  q0, q3                         \n"
-
-    "vpaddl.u16 q0, q0                         \n"
-
-    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
-
-    "vmovn.u16  d0, q0                         \n"
-    "vst1.u32   {d0[0]}, [%1]!                 \n"
-
-    "subs       %2, #4                         \n"
-    "bhi        1b                             \n"
-
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width)         // %2
-    : "r"(src_stride)         // %3
-    : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
-  );
-}
-
-#define HAS_SCALEROWDOWN34_NEON
-// Down scale from 4 to 3 pixels.  Use the neon multilane read/write
-//  to load up the every 4th pixel into a 4 different registers.
-// Point samples 32 pixels to 24 pixels.
-static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "1:                                        \n"
-    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    "vmov         d2, d3                       \n" // order needs to be d0, d1, d2
-    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
-    "subs         %2, #24                      \n"
-    "bhi          1b                           \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width)         // %2
-    :
-    : "d0", "d1", "d2", "d3", "memory", "cc"
-  );
-}
-
-static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vmov.u8      d24, #3                      \n"
-    "add          %3, %0                       \n"
-    "1:                                        \n"
-    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n" // src line 1
-
-    // filter src line 0 with src line 1
-    // expand chars to shorts to allow for room
-    // when adding lines together
-    "vmovl.u8     q8, d4                       \n"
-    "vmovl.u8     q9, d5                       \n"
-    "vmovl.u8     q10, d6                      \n"
-    "vmovl.u8     q11, d7                      \n"
-
-    // 3 * line_0 + line_1
-    "vmlal.u8     q8, d0, d24                  \n"
-    "vmlal.u8     q9, d1, d24                  \n"
-    "vmlal.u8     q10, d2, d24                 \n"
-    "vmlal.u8     q11, d3, d24                 \n"
-
-    // (3 * line_0 + line_1) >> 2
-    "vqrshrn.u16  d0, q8, #2                   \n"
-    "vqrshrn.u16  d1, q9, #2                   \n"
-    "vqrshrn.u16  d2, q10, #2                  \n"
-    "vqrshrn.u16  d3, q11, #2                  \n"
-
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "vmovl.u8     q8, d1                       \n"
-    "vmlal.u8     q8, d0, d24                  \n"
-    "vqrshrn.u16  d0, q8, #2                   \n"
-
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "vrhadd.u8    d1, d1, d2                   \n"
-
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "vmovl.u8     q8, d2                       \n"
-    "vmlal.u8     q8, d3, d24                  \n"
-    "vqrshrn.u16  d2, q8, #2                   \n"
-
-    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
-
-    "subs         %2, #24                      \n"
-    "bhi          1b                           \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width),        // %2
-      "+r"(src_stride)        // %3
-    :
-    : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
-  );
-}
-
-static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vmov.u8      d24, #3                      \n"
-    "add          %3, %0                       \n"
-    "1:                                        \n"
-    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n" // src line 1
-
-    // average src line 0 with src line 1
-    "vrhadd.u8    q0, q0, q2                   \n"
-    "vrhadd.u8    q1, q1, q3                   \n"
-
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "vmovl.u8     q3, d1                       \n"
-    "vmlal.u8     q3, d0, d24                  \n"
-    "vqrshrn.u16  d0, q3, #2                   \n"
-
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "vrhadd.u8    d1, d1, d2                   \n"
-
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "vmovl.u8     q3, d2                       \n"
-    "vmlal.u8     q3, d3, d24                  \n"
-    "vqrshrn.u16  d2, q3, #2                   \n"
-
-    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
-
-    "subs         %2, #24                      \n"
-    "bhi          1b                           \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width),        // %2
-      "+r"(src_stride)        // %3
-    :
-    : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
-  );
-}
-
-#define HAS_SCALEROWDOWN38_NEON
-const uint8 shuf38[16] __attribute__ ((aligned(16))) =
-  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-const uint8 shuf38_2[16] __attribute__ ((aligned(16))) =
-  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) =
-  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
-    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
-  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
-    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
-
-// 32 -> 12
-static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vld1.u8      {q3}, [%3]                   \n"
-    "1:                                        \n"
-    "vld1.u8      {d0, d1, d2, d3}, [%0]!      \n"
-    "vtbl.u8      d4, {d0, d1, d2, d3}, d6     \n"
-    "vtbl.u8      d5, {d0, d1, d2, d3}, d7     \n"
-    "vst1.u8      {d4}, [%1]!                  \n"
-    "vst1.u32     {d5[0]}, [%1]!               \n"
-    "subs         %2, #12                      \n"
-    "bhi          1b                           \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width)         // %2
-    : "r"(shuf38)             // %3
-    : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
-  );
-}
-
-// 32x3 -> 12x1
-static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vld1.u16     {q13}, [%4]                  \n"
-    "vld1.u8      {q14}, [%5]                  \n"
-    "vld1.u8      {q15}, [%6]                  \n"
-    "add          r4, %0, %3, lsl #1           \n"
-    "add          %3, %0                       \n"
-    "1:                                        \n"
-
-    // d0 = 00 40 01 41 02 42 03 43
-    // d1 = 10 50 11 51 12 52 13 53
-    // d2 = 20 60 21 61 22 62 23 63
-    // d3 = 30 70 31 71 32 72 33 73
-    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n"
-    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n"
-    "vld4.u8      {d16, d17, d18, d19}, [r4]!  \n"
-
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
-    // d0 = 00 10 01 11 02 12 03 13
-    // d1 = 40 50 41 51 42 52 43 53
-    "vtrn.u8      d0, d1                       \n"
-    "vtrn.u8      d4, d5                       \n"
-    "vtrn.u8      d16, d17                     \n"
-
-    // d2 = 20 30 21 31 22 32 23 33
-    // d3 = 60 70 61 71 62 72 63 73
-    "vtrn.u8      d2, d3                       \n"
-    "vtrn.u8      d6, d7                       \n"
-    "vtrn.u8      d18, d19                     \n"
-
-    // d0 = 00+10 01+11 02+12 03+13
-    // d2 = 40+50 41+51 42+52 43+53
-    "vpaddl.u8    q0, q0                       \n"
-    "vpaddl.u8    q2, q2                       \n"
-    "vpaddl.u8    q8, q8                       \n"
-
-    // d3 = 60+70 61+71 62+72 63+73
-    "vpaddl.u8    d3, d3                       \n"
-    "vpaddl.u8    d7, d7                       \n"
-    "vpaddl.u8    d19, d19                     \n"
-
-    // combine source lines
-    "vadd.u16     q0, q2                       \n"
-    "vadd.u16     q0, q8                       \n"
-    "vadd.u16     d4, d3, d7                   \n"
-    "vadd.u16     d4, d19                      \n"
-
-    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
-    //             + s[6 + st * 1] + s[7 + st * 1]
-    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
-    "vqrdmulh.s16 q2, q13                      \n"
-    "vmovn.u16    d4, q2                       \n"
-
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg.  This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded.  Then do transposes
-    //  to get aligned.
-    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "vmovl.u8     q1, d2                       \n"
-    "vmovl.u8     q3, d6                       \n"
-    "vmovl.u8     q9, d18                      \n"
-
-    // combine source lines
-    "vadd.u16     q1, q3                       \n"
-    "vadd.u16     q1, q9                       \n"
-
-    // d4 = xx 20 xx 30 xx 22 xx 32
-    // d5 = xx 21 xx 31 xx 23 xx 33
-    "vtrn.u32     d2, d3                       \n"
-
-    // d4 = xx 20 xx 21 xx 22 xx 23
-    // d5 = xx 30 xx 31 xx 32 xx 33
-    "vtrn.u16     d2, d3                       \n"
-
-    // 0+1+2, 3+4+5
-    "vadd.u16     q0, q1                       \n"
-
-    // Need to divide, but can't downshift as the value
-    //  isn't a power of 2.  So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "vqrdmulh.s16 q0, q15                      \n"
-
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "vmov.u8      d2, d4                       \n"
-
-    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
-    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
-
-    "vst1.u8      {d3}, [%1]!                  \n"
-    "vst1.u32     {d4[0]}, [%1]!               \n"
-    "subs         %2, #12                      \n"
-    "bhi          1b                           \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width),        // %2
-      "+r"(src_stride)        // %3
-    : "r"(mult38_div6),       // %4
-      "r"(shuf38_2),          // %5
-      "r"(mult38_div9)        // %6
-    : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
-      "q13", "q14", "q15", "memory", "cc"
-  );
-}
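The "multiply by 65536 / n and take the upper 16 bits" trick described in the comments above is ordinary fixed-point reciprocal division. A minimal sketch follows (hypothetical helper, approximate result); note the NEON tables store 65536 / 12 and 65536 / 18 rather than 65536 / 6 and 65536 / 9 because vqrdmulh doubles the product before taking the high half.

#include <stdint.h>

/* Approximate sum / n for small n without a divide: multiply by a 16-bit
 * reciprocal and keep the top 16 bits of the 32-bit product. */
static uint8_t DivByReciprocal(uint16_t sum, unsigned n) {
  uint16_t recip = (uint16_t)(65536u / n);           /* e.g. n = 6 or n = 9 */
  return (uint8_t)(((uint32_t)sum * recip) >> 16);
}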
-
-// 32x2 -> 12x1
-static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vld1.u16     {q13}, [%4]                  \n"
-    "vld1.u8      {q14}, [%5]                  \n"
-    "add          %3, %0                       \n"
-    "1:                                        \n"
-
-    // d0 = 00 40 01 41 02 42 03 43
-    // d1 = 10 50 11 51 12 52 13 53
-    // d2 = 20 60 21 61 22 62 23 63
-    // d3 = 30 70 31 71 32 72 33 73
-    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n"
-    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n"
-
-    // Shuffle the input data around to align the data
-    //  so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
-    // d0 = 00 10 01 11 02 12 03 13
-    // d1 = 40 50 41 51 42 52 43 53
-    "vtrn.u8      d0, d1                       \n"
-    "vtrn.u8      d4, d5                       \n"
-
-    // d2 = 20 30 21 31 22 32 23 33
-    // d3 = 60 70 61 71 62 72 63 73
-    "vtrn.u8      d2, d3                       \n"
-    "vtrn.u8      d6, d7                       \n"
-
-    // d0 = 00+10 01+11 02+12 03+13
-    // d2 = 40+50 41+51 42+52 43+53
-    "vpaddl.u8    q0, q0                       \n"
-    "vpaddl.u8    q2, q2                       \n"
-
-    // d3 = 60+70 61+71 62+72 63+73
-    "vpaddl.u8    d3, d3                       \n"
-    "vpaddl.u8    d7, d7                       \n"
-
-    // combine source lines
-    "vadd.u16     q0, q2                       \n"
-    "vadd.u16     d4, d3, d7                   \n"
-
-    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
-    "vqrshrn.u16  d4, q2, #2                   \n"
-
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg.  This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded.  Then do transposes
-    //  to get aligned.
-    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "vmovl.u8     q1, d2                       \n"
-    "vmovl.u8     q3, d6                       \n"
-
-    // combine source lines
-    "vadd.u16     q1, q3                       \n"
-
-    // d4 = xx 20 xx 30 xx 22 xx 32
-    // d5 = xx 21 xx 31 xx 23 xx 33
-    "vtrn.u32     d2, d3                       \n"
-
-    // d4 = xx 20 xx 21 xx 22 xx 23
-    // d5 = xx 30 xx 31 xx 32 xx 33
-    "vtrn.u16     d2, d3                       \n"
-
-    // 0+1+2, 3+4+5
-    "vadd.u16     q0, q1                       \n"
-
-    // Need to divide, but can't downshift as the value
-    //  isn't a power of 2.  So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "vqrdmulh.s16 q0, q13                      \n"
-
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "vmov.u8      d2, d4                       \n"
-
-    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
-    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
-
-    "vst1.u8      {d3}, [%1]!                  \n"
-    "vst1.u32     {d4[0]}, [%1]!               \n"
-    "subs         %2, #12                      \n"
-    "bhi          1b                           \n"
-    : "+r"(src_ptr),          // %0
-      "+r"(dst_ptr),          // %1
-      "+r"(dst_width),        // %2
-      "+r"(src_stride)        // %3
-    : "r"(mult38_div6),       // %4
-      "r"(shuf38_2)           // %5
-    : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
-  );
-}
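Stripping away the shuffles, the routine above computes the following per output triple. This is a plain-C sketch with truncating division and a hypothetical name; the NEON code uses rounding averages and the reciprocal-multiply trick instead.

#include <stdint.h>

/* 8 source columns x 2 rows -> 3 output pixels: two 3x2 box means and
 * one 2x2 box mean, matching the 3/8 scaler's column pattern. */
static void ScaleRowDown38_2_Box_sketch(const uint8_t* src, int stride,
                                        uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    const uint8_t* s0 = src;
    const uint8_t* s1 = src + stride;
    dst[0] = (uint8_t)((s0[0] + s0[1] + s0[2] + s1[0] + s1[1] + s1[2]) / 6);
    dst[1] = (uint8_t)((s0[3] + s0[4] + s0[5] + s1[3] + s1[4] + s1[5]) / 6);
    dst[2] = (uint8_t)((s0[6] + s0[7] + s1[6] + s1[7]) / 4);
    src += 8;
    dst += 3;
  }
}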
-
-/**
- * SSE2 downscalers with interpolation.
- *
- * Provided by Frank Barchard (fbarchard@google.com)
- *
- */
-
-// Constants for SSE2 code
-#elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \
-    !defined(YUV_DISABLE_ASM)
-#if defined(_MSC_VER)
-#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
-#elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
-#define TALIGN16(t, var) t var __attribute__((aligned(16)))
-#else
-#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
-#endif
-
-#if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \
-    defined(__i386__)
-#define DECLARE_FUNCTION(name)                                                 \
-    ".text                                     \n"                             \
-    ".globl _" #name "                         \n"                             \
-"_" #name ":                                   \n"
-#else
-#define DECLARE_FUNCTION(name)                                                 \
-    ".text                                     \n"                             \
-    ".global " #name "                         \n"                             \
-#name ":                                       \n"
-#endif
-
-
-// Offsets for source bytes 0 to 9
-//extern "C"
-TALIGN16(const uint8, shuf0[16]) =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-//extern "C"
-TALIGN16(const uint8, shuf1[16]) =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
-//extern "C"
-TALIGN16(const uint8, shuf2[16]) =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Offsets for source bytes 0 to 10
-//extern "C"
-TALIGN16(const uint8, shuf01[16]) =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
-
-// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
-//extern "C"
-TALIGN16(const uint8, shuf11[16]) =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
-
-// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
-//extern "C"
-TALIGN16(const uint8, shuf21[16]) =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
-
-// Coefficients for source bytes 0 to 10
-//extern "C"
-TALIGN16(const uint8, madd01[16]) =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
-
-// Coefficients for source bytes 10 to 21
-//extern "C"
-TALIGN16(const uint8, madd11[16]) =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
-
-// Coefficients for source bytes 21 to 31
-//extern "C"
-TALIGN16(const uint8, madd21[16]) =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
-
-// Rounding constant for the 3/4 filters (added before the final >> 2)
-//extern "C"
-TALIGN16(const int16, round34[8]) =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
-
-//extern "C"
-TALIGN16(const uint8, shuf38a[16]) =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-//extern "C"
-TALIGN16(const uint8, shuf38b[16]) =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 0,1,2
-//extern "C"
-TALIGN16(const uint8, shufac0[16]) =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
-
-// Arrange words 0,3,6 into 3,4,5
-//extern "C"
-TALIGN16(const uint8, shufac3[16]) =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x3 and 2x3
-//extern "C"
-TALIGN16(const uint16, scaleac3[8]) =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
-
-// Arrange first value for pixels 0,1,2,3,4,5
-//extern "C"
-TALIGN16(const uint8, shufab0[16]) =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
-
-// Arrange second value for pixels 0,1,2,3,4,5
-//extern "C"
-TALIGN16(const uint8, shufab1[16]) =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
-
-// Arrange third value for pixels 0,1,2,3,4,5
-//extern "C"
-TALIGN16(const uint8, shufab2[16]) =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
-
-// Scaling values for boxes of 3x2 and 2x2
-//extern "C"
-TALIGN16(const uint16, scaleab2[8]) =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
-#endif
-
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) && defined(_MSC_VER)
-
-#define HAS_SCALEROWDOWN2_SSE2
-// Reads 32 pixels, throws half away and writes 16 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked)
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    lea        eax,  [eax + 32]
-    pand       xmm0, xmm5
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    ja         wloop
-
-    ret
-  }
-}
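Ignoring the SIMD mechanics, the routine above keeps every second source pixel (the 0x00ff00ff mask selects the even-indexed bytes). A plain-C sketch with a hypothetical name:

#include <stdint.h>

static void ScaleRowDown2_sketch(const uint8_t* src, uint8_t* dst,
                                 int dst_width) {
  for (int x = 0; x < dst_width; ++x)
    dst[x] = src[x * 2];  /* point sample: keep even pixels */
}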
-// Blends 32x2 rectangle to 16x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
-__declspec(naked)
-void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  __asm {
-    push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
-
-  wloop:
-    movdqa     xmm0, [eax]
-    movdqa     xmm1, [eax + 16]
-    movdqa     xmm2, [eax + esi]
-    movdqa     xmm3, [eax + esi + 16]
-    lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm5
-    pand       xmm3, xmm5
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-
-    movdqa     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 16
-    ja         wloop
-
-    pop        esi
-    ret
-  }
-}
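The blend above is a 2x2 box average. An idealized plain-C sketch is below (hypothetical name); the assembly approximates the exact (a + b + c + d + 2) >> 2 with cascaded pavgb/pavgw instructions, which can differ from it by one in some cases.

#include <stdint.h>

static void ScaleRowDown2Box_sketch(const uint8_t* src, int stride,
                                    uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    const uint8_t* s = src + x * 2;
    dst[x] = (uint8_t)((s[0] + s[1] + s[stride] + s[stride + 1] + 2) >> 2);
  }
}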
-
-#define HAS_SCALEROWDOWN4_SSE2
-// Point samples 32 pixels to 8 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  __asm {
-    pushad
-    mov        esi, [esp + 32 + 4]   // src_ptr
-                                     // src_stride ignored
-    mov        edi, [esp + 32 + 12]  // dst_ptr
-    mov        ecx, [esp + 32 + 16]  // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x000000ff
-    psrld      xmm5, 24
-
-  wloop:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + 16]
-    lea        esi,  [esi + 32]
-    pand       xmm0, xmm5
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edi], xmm0
-    lea        edi, [edi + 8]
-    sub        ecx, 8
-    ja         wloop
-
-    popad
-    ret
-  }
-}
-
-// Blends 32x4 rectangle to 8x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  __asm {
-    pushad
-    mov        esi, [esp + 32 + 4]   // src_ptr
-    mov        ebx, [esp + 32 + 8]   // src_stride
-    mov        edi, [esp + 32 + 12]  // dst_ptr
-    mov        ecx, [esp + 32 + 16]  // dst_width
-    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
-    psrlw      xmm7, 8
-    lea        edx, [ebx + ebx * 2]  // src_stride * 3
-
-  wloop:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + 16]
-    movdqa     xmm2, [esi + ebx]
-    movdqa     xmm3, [esi + ebx + 16]
-    pavgb      xmm0, xmm2            // average rows
-    pavgb      xmm1, xmm3
-    movdqa     xmm2, [esi + ebx * 2]
-    movdqa     xmm3, [esi + ebx * 2 + 16]
-    movdqa     xmm4, [esi + edx]
-    movdqa     xmm5, [esi + edx + 16]
-    lea        esi, [esi + 32]
-    pavgb      xmm2, xmm4
-    pavgb      xmm3, xmm5
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
-    psrlw      xmm0, 8
-    movdqa     xmm3, xmm1
-    psrlw      xmm1, 8
-    pand       xmm2, xmm7
-    pand       xmm3, xmm7
-    pavgw      xmm0, xmm2
-    pavgw      xmm1, xmm3
-    packuswb   xmm0, xmm1
-
-    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
-    psrlw      xmm0, 8
-    pand       xmm2, xmm7
-    pavgw      xmm0, xmm2
-    packuswb   xmm0, xmm0
-
-    movq       qword ptr [edi], xmm0
-    lea        edi, [edi + 8]
-    sub        ecx, 8
-    ja         wloop
-
-    popad
-    ret
-  }
-}
-
-#define HAS_SCALEROWDOWN8_SSE2
-// Point samples 32 pixels to 4 pixels.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked)
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  __asm {
-    pushad
-    mov        esi, [esp + 32 + 4]   // src_ptr
-                                     // src_stride ignored
-    mov        edi, [esp + 32 + 12]  // dst_ptr
-    mov        ecx, [esp + 32 + 16]  // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask isolating 1 byte per 8
-    psrlq      xmm5, 56
-
-  wloop:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + 16]
-    lea        esi,  [esi + 32]
-    pand       xmm0, xmm5
-    pand       xmm1, xmm5
-    packuswb   xmm0, xmm1  // 32->16
-    packuswb   xmm0, xmm0  // 16->8
-    packuswb   xmm0, xmm0  // 8->4
-    movd       dword ptr [edi], xmm0
-    lea        edi, [edi + 4]
-    sub        ecx, 4
-    ja         wloop
-
-    popad
-    ret
-  }
-}
-
-// Blends 32x8 rectangle to 4x1.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
-__declspec(naked)
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  __asm {
-    pushad
-    mov        esi, [esp + 32 + 4]   // src_ptr
-    mov        ebx, [esp + 32 + 8]   // src_stride
-    mov        edi, [esp + 32 + 12]  // dst_ptr
-    mov        ecx, [esp + 32 + 16]  // dst_width
-    lea        edx, [ebx + ebx * 2]  // src_stride * 3
-    pxor       xmm7, xmm7
-
-  wloop:
-    movdqa     xmm0, [esi]           // average 8 rows to 1
-    movdqa     xmm1, [esi + 16]
-    movdqa     xmm2, [esi + ebx]
-    movdqa     xmm3, [esi + ebx + 16]
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-    movdqa     xmm2, [esi + ebx * 2]
-    movdqa     xmm3, [esi + ebx * 2 + 16]
-    movdqa     xmm4, [esi + edx]
-    movdqa     xmm5, [esi + edx + 16]
-    lea        ebp, [esi + ebx * 4]
-    lea        esi, [esi + 32]
-    pavgb      xmm2, xmm4
-    pavgb      xmm3, xmm5
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-
-    movdqa     xmm2, [ebp]
-    movdqa     xmm3, [ebp + 16]
-    movdqa     xmm4, [ebp + ebx]
-    movdqa     xmm5, [ebp + ebx + 16]
-    pavgb      xmm2, xmm4
-    pavgb      xmm3, xmm5
-    movdqa     xmm4, [ebp + ebx * 2]
-    movdqa     xmm5, [ebp + ebx * 2 + 16]
-    movdqa     xmm6, [ebp + edx]
-    pavgb      xmm4, xmm6
-    movdqa     xmm6, [ebp + edx + 16]
-    pavgb      xmm5, xmm6
-    pavgb      xmm2, xmm4
-    pavgb      xmm3, xmm5
-    pavgb      xmm0, xmm2
-    pavgb      xmm1, xmm3
-
-    psadbw     xmm0, xmm7            // average 32 pixels to 4
-    psadbw     xmm1, xmm7
-    pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
-    pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
-    por        xmm0, xmm1            //      -> 3201
-    psrlw      xmm0, 3
-    packuswb   xmm0, xmm0
-    packuswb   xmm0, xmm0
-    movd       dword ptr [edi], xmm0
-
-    lea        edi, [edi + 4]
-    sub        ecx, 4
-    ja         wloop
-
-    popad
-    ret
-  }
-}
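Two things are happening above: cascaded pavgb collapses the 8 rows to 1, and psadbw against a zero register sums each group of 8 bytes into one 16-bit lane, so the psrlw by 3 completes an approximate 8x8 box mean. An idealized plain-C sketch of one output pixel (hypothetical helper; the cascaded averaging in the assembly only approximates this exact mean):

#include <stdint.h>

static uint8_t Box8x8Mean_sketch(const uint8_t* src, int stride) {
  unsigned sum = 0;
  for (int y = 0; y < 8; ++y)
    for (int x = 0; x < 8; ++x)
      sum += src[y * stride + x];
  return (uint8_t)(sum / 64);  /* mean of the 8x8 box */
}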
-
-#define HAS_SCALEROWDOWN34_SSSE3
-// Point samples 32 pixels to 24 pixels.
-// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
-                                 uint8* dst_ptr, int dst_width) {
-  __asm {
-    pushad
-    mov        esi, [esp + 32 + 4]   // src_ptr
-                                     // src_stride ignored
-    mov        edi, [esp + 32 + 12]  // dst_ptr
-    mov        ecx, [esp + 32 + 16]  // dst_width
-    movdqa     xmm3, _shuf0
-    movdqa     xmm4, _shuf1
-    movdqa     xmm5, _shuf2
-
-  wloop:
-    movdqa     xmm0, [esi]
-    movdqa     xmm1, [esi + 16]
-    lea        esi,  [esi + 32]
-    movdqa     xmm2, xmm1
-    palignr    xmm1, xmm0, 8
-    pshufb     xmm0, xmm3
-    pshufb     xmm1, xmm4
-    pshufb     xmm2, xmm5
-    movq       qword ptr [edi], xmm0
-    movq       qword ptr [edi + 8], xmm1
-    movq       qword ptr [edi + 16], xmm2
-    lea        edi, [edi + 24]
-    sub        ecx, 24
-    ja         wloop
-
-    popad
-    ret
-  }
-}
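The shuf0/shuf1/shuf2 tables loaded above implement plain point sampling: from every 4 source pixels, pixels 0, 1 and 3 are kept. Equivalent plain-C sketch (hypothetical name):

#include <stdint.h>

static void ScaleRowDown34_sketch(const uint8_t* src, uint8_t* dst,
                                  int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src[0];  /* keep pixels 0, 1, 3 of each 4 */
    dst[1] = src[1];
    dst[2] = src[3];
    dst += 3;
    src += 4;
  }
}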
-
-// Blends 32x2 rectangle to 24x1
-// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
-// Then shuffled to do the scaling.
-
-// Register usage:
-// xmm0 src_row 0
-// xmm1 src_row 1
-// xmm2 shuf 0
-// xmm3 shuf 1
-// xmm4 shuf 2
-// xmm5 madd 0
-// xmm6 madd 1
-// xmm7 round34
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  __asm {
-    pushad
-    mov        esi, [esp + 32 + 4]   // src_ptr
-    mov        ebx, [esp + 32 + 8]   // src_stride
-    mov        edi, [esp + 32 + 12]  // dst_ptr
-    mov        ecx, [esp + 32 + 16]  // dst_width
-    movdqa     xmm2, _shuf01
-    movdqa     xmm3, _shuf11
-    movdqa     xmm4, _shuf21
-    movdqa     xmm5, _madd01
-    movdqa     xmm6, _madd11
-    movdqa     xmm7, _round34
-
-  wloop:
-    movdqa     xmm0, [esi]           // pixels 0..7
-    movdqa     xmm1, [esi+ebx]
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm2
-    pmaddubsw  xmm0, xmm5
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edi], xmm0
-    movdqu     xmm0, [esi+8]         // pixels 8..15
-    movdqu     xmm1, [esi+ebx+8]
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm3
-    pmaddubsw  xmm0, xmm6
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edi+8], xmm0
-    movdqa     xmm0, [esi+16]        // pixels 16..23
-    movdqa     xmm1, [esi+ebx+16]
-    lea        esi, [esi+32]
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm4
-    movdqa     xmm1, _madd21
-    pmaddubsw  xmm0, xmm1
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edi+16], xmm0
-    lea        edi, [edi+24]
-    sub        ecx, 24
-    ja         wloop
-
-    popad
-    ret
-  }
-}
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  __asm {
-    pushad
-    mov        esi, [esp + 32 + 4]   // src_ptr
-    mov        ebx, [esp + 32 + 8]   // src_stride
-    mov        edi, [esp + 32 + 12]  // dst_ptr
-    mov        ecx, [esp + 32 + 16]  // dst_width
-    movdqa     xmm2, _shuf01
-    movdqa     xmm3, _shuf11
-    movdqa     xmm4, _shuf21
-    movdqa     xmm5, _madd01
-    movdqa     xmm6, _madd11
-    movdqa     xmm7, _round34
-
-  wloop:
-    movdqa     xmm0, [esi]           // pixels 0..7
-    movdqa     xmm1, [esi+ebx]
-    pavgb      xmm1, xmm0
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm2
-    pmaddubsw  xmm0, xmm5
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edi], xmm0
-    movdqu     xmm0, [esi+8]         // pixels 8..15
-    movdqu     xmm1, [esi+ebx+8]
-    pavgb      xmm1, xmm0
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm3
-    pmaddubsw  xmm0, xmm6
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edi+8], xmm0
-    movdqa     xmm0, [esi+16]        // pixels 16..23
-    movdqa     xmm1, [esi+ebx+16]
-    lea        esi, [esi+32]
-    pavgb      xmm1, xmm0
-    pavgb      xmm0, xmm1
-    pshufb     xmm0, xmm4
-    movdqa     xmm1, _madd21
-    pmaddubsw  xmm0, xmm1
-    paddsw     xmm0, xmm7
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edi+16], xmm0
-    lea        edi, [edi+24]
-    sub        ecx, 24
-    ja         wloop
-
-    popad
-    ret
-  }
-}
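The only difference from the _1 variant above is the extra pavgb: blending avg(r0, avg(r0, r1)) approximates a 3:1 vertical weighting of the two rows, whereas a single pavgb gives the 1:1 blend. In exact arithmetic the 3:1 blend is the following (hypothetical helper; the cascaded pavgb rounds up at each step and may differ from this by one):

#include <stdint.h>

static uint8_t Blend31_sketch(uint8_t r0, uint8_t r1) {
  return (uint8_t)((3 * r0 + r1 + 2) >> 2);  /* idealized (3*r0 + r1) / 4 */
}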
-
-#define HAS_SCALEROWDOWN38_SSSE3
-// 3/8 point sampler
-
-// Scale 32 pixels to 12
-__declspec(naked)
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
-                                 uint8* dst_ptr, int dst_width) {
-  __asm {
-    pushad
-    mov        esi, [esp + 32 + 4]   // src_ptr
-    mov        edx, [esp + 32 + 8]   // src_stride
-    mov        edi, [esp + 32 + 12]  // dst_ptr
-    mov        ecx, [esp + 32 + 16]  // dst_width
-    movdqa     xmm4, _shuf38a
-    movdqa     xmm5, _shuf38b
-
-  xloop:
-    movdqa     xmm0, [esi]           // 16 pixels -> 0,1,2,3,4,5
-    movdqa     xmm1, [esi + 16]      // 16 pixels -> 6,7,8,9,10,11
-    lea        esi, [esi + 32]
-    pshufb     xmm0, xmm4
-    pshufb     xmm1, xmm5
-    paddusb    xmm0, xmm1
-
-    movq       qword ptr [edi], xmm0 // write 12 pixels
-    movhlps    xmm1, xmm0
-    movd       [edi + 8], xmm1
-    lea        edi, [edi + 12]
-    sub        ecx, 12
-    ja         xloop
-
-    popad
-    ret
-  }
-}
-
-// Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked)
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  __asm {
-    pushad
-    mov        esi, [esp + 32 + 4]   // src_ptr
-    mov        edx, [esp + 32 + 8]   // src_stride
-    mov        edi, [esp + 32 + 12]  // dst_ptr
-    mov        ecx, [esp + 32 + 16]  // dst_width
-    movdqa     xmm4, _shufac0
-    movdqa     xmm5, _shufac3
-    movdqa     xmm6, _scaleac3
-    pxor       xmm7, xmm7
-
-  xloop:
-    movdqa     xmm0, [esi]           // sum up 3 rows into xmm0/1
-    movdqa     xmm2, [esi + edx]
-    movhlps    xmm1, xmm0
-    movhlps    xmm3, xmm2
-    punpcklbw  xmm0, xmm7
-    punpcklbw  xmm1, xmm7
-    punpcklbw  xmm2, xmm7
-    punpcklbw  xmm3, xmm7
-    paddusw    xmm0, xmm2
-    paddusw    xmm1, xmm3
-    movdqa     xmm2, [esi + edx * 2]
-    lea        esi, [esi + 16]
-    movhlps    xmm3, xmm2
-    punpcklbw  xmm2, xmm7
-    punpcklbw  xmm3, xmm7
-    paddusw    xmm0, xmm2
-    paddusw    xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // 8 pixels -> 0,1,2 of xmm2
-    psrldq     xmm0, 2
-    paddusw    xmm2, xmm0
-    psrldq     xmm0, 2
-    paddusw    xmm2, xmm0
-    pshufb     xmm2, xmm4
-
-    movdqa     xmm3, xmm1            // 8 pixels -> 3,4,5 of xmm2
-    psrldq     xmm1, 2
-    paddusw    xmm3, xmm1
-    psrldq     xmm1, 2
-    paddusw    xmm3, xmm1
-    pshufb     xmm3, xmm5
-    paddusw    xmm2, xmm3
-
-    pmulhuw    xmm2, xmm6            // divide by 9,9,6, 9,9,6
-    packuswb   xmm2, xmm2
-
-    movd       [edi], xmm2           // write 6 pixels
-    pextrw     eax, xmm2, 2
-    mov        [edi + 4], ax
-    lea        edi, [edi + 6]
-    sub        ecx, 6
-    ja         xloop
-
-    popad
-    ret
-  }
-}
-
-// Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked)
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  __asm {
-    pushad
-    mov        esi, [esp + 32 + 4]   // src_ptr
-    mov        edx, [esp + 32 + 8]   // src_stride
-    mov        edi, [esp + 32 + 12]  // dst_ptr
-    mov        ecx, [esp + 32 + 16]  // dst_width
-    movdqa     xmm4, _shufab0
-    movdqa     xmm5, _shufab1
-    movdqa     xmm6, _shufab2
-    movdqa     xmm7, _scaleab2
-
-  xloop:
-    movdqa     xmm2, [esi]           // average 2 rows into xmm2
-    pavgb      xmm2, [esi + edx]
-    lea        esi, [esi + 16]
-
-    movdqa     xmm0, xmm2            // 16 pixels -> 0,1,2,3,4,5 of xmm0
-    pshufb     xmm0, xmm4
-    movdqa     xmm1, xmm2
-    pshufb     xmm1, xmm5
-    paddusw    xmm0, xmm1
-    pshufb     xmm2, xmm6
-    paddusw    xmm0, xmm2
-
-    pmulhuw    xmm0, xmm7            // divide by 3,3,2, 3,3,2
-    packuswb   xmm0, xmm0
-
-    movd       [edi], xmm0           // write 6 pixels
-    pextrw     eax, xmm0, 2
-    mov        [edi + 4], ax
-    lea        edi, [edi + 6]
-    sub        ecx, 6
-    ja         xloop
-
-    popad
-    ret
-  }
-}
-
-#define HAS_SCALEADDROWS_SSE2
-
-// Reads 16xN bytes and produces 16 shorts at a time.
-__declspec(naked)
-static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
-                              uint16* dst_ptr, int src_width,
-                              int src_height) {
-  __asm {
-    pushad
-    mov        esi, [esp + 32 + 4]   // src_ptr
-    mov        edx, [esp + 32 + 8]   // src_stride
-    mov        edi, [esp + 32 + 12]  // dst_ptr
-    mov        ecx, [esp + 32 + 16]  // src_width
-    mov        ebx, [esp + 32 + 20]  // src_height
-    pxor       xmm5, xmm5
-    dec        ebx
-
-  xloop:
-    // first row
-    movdqa     xmm2, [esi]
-    lea        eax, [esi + edx]
-    movhlps    xmm3, xmm2
-    mov        ebp, ebx
-    punpcklbw  xmm2, xmm5
-    punpcklbw  xmm3, xmm5
-
-    // sum remaining rows
-  yloop:
-    movdqa     xmm0, [eax]       // read 16 pixels
-    lea        eax, [eax + edx]  // advance to next row
-    movhlps    xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpcklbw  xmm1, xmm5
-    paddusw    xmm2, xmm0        // sum 16 words
-    paddusw    xmm3, xmm1
-    sub        ebp, 1
-    ja         yloop
-
-    movdqa     [edi], xmm2
-    movdqa     [edi + 16], xmm3
-    lea        edi, [edi + 32]
-    lea        esi, [esi + 16]
-
-    sub        ecx, 16
-    ja         xloop
-
-    popad
-    ret
-  }
-}
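In plain C the routine above is just a column-wise sum of src_height rows into 16-bit accumulators (the assembly uses saturating adds). Sketch with a hypothetical name:

#include <stdint.h>

static void ScaleAddRows_sketch(const uint8_t* src, int stride,
                                uint16_t* dst, int src_width,
                                int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16_t sum = 0;
    for (int y = 0; y < src_height; ++y)
      sum = (uint16_t)(sum + src[y * stride + x]);
    dst[x] = sum;
  }
}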
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
-#define HAS_SCALEFILTERROWS_SSE2
-__declspec(naked)
-static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                                 int src_stride, int dst_width,
-                                 int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    cmp        eax, 0
-    je         xloop1
-    cmp        eax, 128
-    je         xloop2
-
-    movd       xmm6, eax            // xmm6 = y fraction
-    punpcklwd  xmm6, xmm6
-    pshufd     xmm6, xmm6, 0
-    neg        eax                  // xmm5 = 256 - y fraction
-    add        eax, 256
-    movd       xmm5, eax
-    punpcklwd  xmm5, xmm5
-    pshufd     xmm5, xmm5, 0
-    pxor       xmm7, xmm7
-
-  xloop:
-    movdqa     xmm0, [esi]
-    movdqa     xmm2, [esi + edx]
-    lea        esi, [esi + 16]
-    movdqa     xmm1, xmm0
-    movdqa     xmm3, xmm2
-    punpcklbw  xmm0, xmm7
-    punpcklbw  xmm2, xmm7
-    punpckhbw  xmm1, xmm7
-    punpckhbw  xmm3, xmm7
-    pmullw     xmm0, xmm5           // scale row 0
-    pmullw     xmm1, xmm5
-    pmullw     xmm2, xmm6           // scale row 1
-    pmullw     xmm3, xmm6
-    paddusw    xmm0, xmm2           // sum rows
-    paddusw    xmm1, xmm3
-    psrlw      xmm0, 8
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
-    sub        ecx, 16
-    ja         xloop
-
-    mov        al, [edi - 1]
-    mov        [edi], al
-    pop        edi
-    pop        esi
-    ret
-
-  xloop1:
-    movdqa     xmm0, [esi]
-    lea        esi, [esi + 16]
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
-    sub        ecx, 16
-    ja         xloop1
-
-    mov        al, [edi - 1]
-    mov        [edi], al
-    pop        edi
-    pop        esi
-    ret
-
-  xloop2:
-    movdqa     xmm0, [esi]
-    movdqa     xmm2, [esi + edx]
-    lea        esi, [esi + 16]
-    pavgb      xmm0, xmm2
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
-    sub        ecx, 16
-    ja         xloop2
-
-    mov        al, [edi - 1]
-    mov        [edi], al
-    pop        edi
-    pop        esi
-    ret
-  }
-}
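The body above is a vertical bilinear blend: each output byte is (row0 * (256 - f) + row1 * f) >> 8 with f = source_y_fraction, the f == 0 and f == 128 cases taking the fast copy and pavgb paths, and the final two mov instructions duplicating the last output byte one position past the end of the row. Plain-C sketch (hypothetical name):

#include <stdint.h>

static void ScaleFilterRows_sketch(uint8_t* dst, const uint8_t* src,
                                   int stride, int dst_width, int f) {
  for (int x = 0; x < dst_width; ++x)
    dst[x] = (uint8_t)((src[x] * (256 - f) + src[x + stride] * f) >> 8);
  dst[dst_width] = dst[dst_width - 1];  /* mirror the trailing byte copy */
}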
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
-#define HAS_SCALEFILTERROWS_SSSE3
-__declspec(naked)
-static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                                  int src_stride, int dst_width,
-                                  int source_y_fraction) {
-  __asm {
-    push       esi
-    push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
-    mov        edx, [esp + 8 + 12]  // src_stride
-    mov        ecx, [esp + 8 + 16]  // dst_width
-    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
-    shr        eax, 1
-    cmp        eax, 0
-    je         xloop1
-    cmp        eax, 64
-    je         xloop2
-
-    mov        ah, al
-    neg        al
-    add        al, 128
-    movd       xmm5, eax
-    punpcklwd  xmm5, xmm5
-    pshufd     xmm5, xmm5, 0
-
-  xloop:
-    movdqa     xmm0, [esi]
-    movdqa     xmm2, [esi + edx]
-    lea        esi, [esi + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm2
-    punpckhbw  xmm1, xmm2
-    pmaddubsw  xmm0, xmm5
-    pmaddubsw  xmm1, xmm5
-    psrlw      xmm0, 7
-    psrlw      xmm1, 7
-    packuswb   xmm0, xmm1
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
-    sub        ecx, 16
-    ja         xloop
-
-    mov        al, [edi - 1]
-    mov        [edi], al
-    pop        edi
-    pop        esi
-    ret
-
-  xloop1:
-    movdqa     xmm0, [esi]
-    lea        esi, [esi + 16]
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
-    sub        ecx, 16
-    ja         xloop1
-
-    mov        al, [edi - 1]
-    mov        [edi], al
-    pop        edi
-    pop        esi
-    ret
-
-  xloop2:
-    movdqa     xmm0, [esi]
-    movdqa     xmm2, [esi + edx]
-    lea        esi, [esi + 16]
-    pavgb      xmm0, xmm2
-    movdqa     [edi], xmm0
-    lea        edi, [edi + 16]
-    sub        ecx, 16
-    ja         xloop2
-
-    mov        al, [edi - 1]
-    mov        [edi], al
-    pop        edi
-    pop        esi
-    ret
-
-  }
-}
-
-// Note that movdqa+palign may be better than movdqu.
-// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
-__declspec(naked)
-static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                                    int dst_width) {
-  __asm {
-    mov        edx, [esp + 4]    // dst_ptr
-    mov        eax, [esp + 8]    // src_ptr
-    mov        ecx, [esp + 12]   // dst_width
-    movdqa     xmm1, _round34
-    movdqa     xmm2, _shuf01
-    movdqa     xmm3, _shuf11
-    movdqa     xmm4, _shuf21
-    movdqa     xmm5, _madd01
-    movdqa     xmm6, _madd11
-    movdqa     xmm7, _madd21
-
-  wloop:
-    movdqa     xmm0, [eax]           // pixels 0..7
-    pshufb     xmm0, xmm2
-    pmaddubsw  xmm0, xmm5
-    paddsw     xmm0, xmm1
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax+8]         // pixels 8..15
-    pshufb     xmm0, xmm3
-    pmaddubsw  xmm0, xmm6
-    paddsw     xmm0, xmm1
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx+8], xmm0
-    movdqa     xmm0, [eax+16]        // pixels 16..23
-    lea        eax, [eax+32]
-    pshufb     xmm0, xmm4
-    pmaddubsw  xmm0, xmm7
-    paddsw     xmm0, xmm1
-    psrlw      xmm0, 2
-    packuswb   xmm0, xmm0
-    movq       qword ptr [edx+16], xmm0
-    lea        edx, [edx+24]
-    sub        ecx, 24
-    ja         wloop
-    ret
-  }
-}
-
-#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
-
-// GCC versions of row functions are verbatim conversions from Visual C.
-// Generated using gcc disassembly on Visual C object file:
-// objdump -D yuvscaler.obj >yuvscaler.txt
-#define HAS_SCALEROWDOWN2_SSE2
-static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "pcmpeqb    %%xmm5,%%xmm5                    \n"
-  "psrlw      $0x8,%%xmm5                      \n"
-"1:"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     0x10(%0),%%xmm1                  \n"
-  "lea        0x20(%0),%0                      \n"
-  "pand       %%xmm5,%%xmm0                    \n"
-  "pand       %%xmm5,%%xmm1                    \n"
-  "packuswb   %%xmm1,%%xmm0                    \n"
-  "movdqa     %%xmm0,(%1)                      \n"
-  "lea        0x10(%1),%1                      \n"
-  "sub        $0x10,%2                         \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-);
-}
-
-static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "pcmpeqb    %%xmm5,%%xmm5                    \n"
-  "psrlw      $0x8,%%xmm5                      \n"
-"1:"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     0x10(%0),%%xmm1                  \n"
-  "movdqa     (%0,%3,1),%%xmm2                 \n"
-  "movdqa     0x10(%0,%3,1),%%xmm3             \n"
-  "lea        0x20(%0),%0                      \n"
-  "pavgb      %%xmm2,%%xmm0                    \n"
-  "pavgb      %%xmm3,%%xmm1                    \n"
-  "movdqa     %%xmm0,%%xmm2                    \n"
-  "psrlw      $0x8,%%xmm0                      \n"
-  "movdqa     %%xmm1,%%xmm3                    \n"
-  "psrlw      $0x8,%%xmm1                      \n"
-  "pand       %%xmm5,%%xmm2                    \n"
-  "pand       %%xmm5,%%xmm3                    \n"
-  "pavgw      %%xmm2,%%xmm0                    \n"
-  "pavgw      %%xmm3,%%xmm1                    \n"
-  "packuswb   %%xmm1,%%xmm0                    \n"
-  "movdqa     %%xmm0,(%1)                      \n"
-  "lea        0x10(%1),%1                      \n"
-  "sub        $0x10,%2                         \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc"
-);
-}
-
-#define HAS_SCALEROWDOWN4_SSE2
-static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "pcmpeqb    %%xmm5,%%xmm5                    \n"
-  "psrld      $0x18,%%xmm5                     \n"
-"1:"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     0x10(%0),%%xmm1                  \n"
-  "lea        0x20(%0),%0                      \n"
-  "pand       %%xmm5,%%xmm0                    \n"
-  "pand       %%xmm5,%%xmm1                    \n"
-  "packuswb   %%xmm1,%%xmm0                    \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "movq       %%xmm0,(%1)                      \n"
-  "lea        0x8(%1),%1                       \n"
-  "sub        $0x8,%2                          \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-);
-}
-
-static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  intptr_t temp = 0;
-  asm volatile (
-  "pcmpeqb    %%xmm7,%%xmm7                    \n"
-  "psrlw      $0x8,%%xmm7                      \n"
-  "lea        (%4,%4,2),%3                     \n"
-"1:"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     0x10(%0),%%xmm1                  \n"
-  "movdqa     (%0,%4,1),%%xmm2                 \n"
-  "movdqa     0x10(%0,%4,1),%%xmm3             \n"
-  "pavgb      %%xmm2,%%xmm0                    \n"
-  "pavgb      %%xmm3,%%xmm1                    \n"
-  "movdqa     (%0,%4,2),%%xmm2                 \n"
-  "movdqa     0x10(%0,%4,2),%%xmm3             \n"
-  "movdqa     (%0,%3,1),%%xmm4                 \n"
-  "movdqa     0x10(%0,%3,1),%%xmm5             \n"
-  "lea        0x20(%0),%0                      \n"
-  "pavgb      %%xmm4,%%xmm2                    \n"
-  "pavgb      %%xmm2,%%xmm0                    \n"
-  "pavgb      %%xmm5,%%xmm3                    \n"
-  "pavgb      %%xmm3,%%xmm1                    \n"
-  "movdqa     %%xmm0,%%xmm2                    \n"
-  "psrlw      $0x8,%%xmm0                      \n"
-  "movdqa     %%xmm1,%%xmm3                    \n"
-  "psrlw      $0x8,%%xmm1                      \n"
-  "pand       %%xmm7,%%xmm2                    \n"
-  "pand       %%xmm7,%%xmm3                    \n"
-  "pavgw      %%xmm2,%%xmm0                    \n"
-  "pavgw      %%xmm3,%%xmm1                    \n"
-  "packuswb   %%xmm1,%%xmm0                    \n"
-  "movdqa     %%xmm0,%%xmm2                    \n"
-  "psrlw      $0x8,%%xmm0                      \n"
-  "pand       %%xmm7,%%xmm2                    \n"
-  "pavgw      %%xmm2,%%xmm0                    \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "movq       %%xmm0,(%1)                      \n"
-  "lea        0x8(%1),%1                       \n"
-  "sub        $0x8,%2                          \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width),   // %2
-    "+r"(temp)         // %3
-  : "r"((intptr_t)(src_stride))    // %4
-  : "memory", "cc"
-#if defined(__x86_64__)
-    , "xmm6", "xmm7"
-#endif
-);
-}
-
-#define HAS_SCALEROWDOWN8_SSE2
-static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "pcmpeqb    %%xmm5,%%xmm5                    \n"
-  "psrlq      $0x38,%%xmm5                     \n"
-"1:"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     0x10(%0),%%xmm1                  \n"
-  "lea        0x20(%0),%0                      \n"
-  "pand       %%xmm5,%%xmm0                    \n"
-  "pand       %%xmm5,%%xmm1                    \n"
-  "packuswb   %%xmm1,%%xmm0                    \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "movd       %%xmm0,(%1)                      \n"
-  "lea        0x4(%1),%1                       \n"
-  "sub        $0x4,%2                          \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :
-  : "memory", "cc"
-);
-}
-
-#if defined(__i386__)
-void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
-                                      uint8* dst_ptr, int dst_width);
-  asm(
-    DECLARE_FUNCTION(ScaleRowDown8Int_SSE2)
-    "pusha                                     \n"
-    "mov    0x24(%esp),%esi                    \n"
-    "mov    0x28(%esp),%ebx                    \n"
-    "mov    0x2c(%esp),%edi                    \n"
-    "mov    0x30(%esp),%ecx                    \n"
-    "lea    (%ebx,%ebx,2),%edx                 \n"
-    "pxor   %xmm7,%xmm7                        \n"
-
-"1:"
-    "movdqa (%esi),%xmm0                       \n"
-    "movdqa 0x10(%esi),%xmm1                   \n"
-    "movdqa (%esi,%ebx,1),%xmm2                \n"
-    "movdqa 0x10(%esi,%ebx,1),%xmm3            \n"
-    "pavgb  %xmm2,%xmm0                        \n"
-    "pavgb  %xmm3,%xmm1                        \n"
-    "movdqa (%esi,%ebx,2),%xmm2                \n"
-    "movdqa 0x10(%esi,%ebx,2),%xmm3            \n"
-    "movdqa (%esi,%edx,1),%xmm4                \n"
-    "movdqa 0x10(%esi,%edx,1),%xmm5            \n"
-    "lea    (%esi,%ebx,4),%ebp                 \n"
-    "lea    0x20(%esi),%esi                    \n"
-    "pavgb  %xmm4,%xmm2                        \n"
-    "pavgb  %xmm5,%xmm3                        \n"
-    "pavgb  %xmm2,%xmm0                        \n"
-    "pavgb  %xmm3,%xmm1                        \n"
-    "movdqa 0x0(%ebp),%xmm2                    \n"
-    "movdqa 0x10(%ebp),%xmm3                   \n"
-    "movdqa 0x0(%ebp,%ebx,1),%xmm4             \n"
-    "movdqa 0x10(%ebp,%ebx,1),%xmm5            \n"
-    "pavgb  %xmm4,%xmm2                        \n"
-    "pavgb  %xmm5,%xmm3                        \n"
-    "movdqa 0x0(%ebp,%ebx,2),%xmm4             \n"
-    "movdqa 0x10(%ebp,%ebx,2),%xmm5            \n"
-    "movdqa 0x0(%ebp,%edx,1),%xmm6             \n"
-    "pavgb  %xmm6,%xmm4                        \n"
-    "movdqa 0x10(%ebp,%edx,1),%xmm6            \n"
-    "pavgb  %xmm6,%xmm5                        \n"
-    "pavgb  %xmm4,%xmm2                        \n"
-    "pavgb  %xmm5,%xmm3                        \n"
-    "pavgb  %xmm2,%xmm0                        \n"
-    "pavgb  %xmm3,%xmm1                        \n"
-    "psadbw %xmm7,%xmm0                        \n"
-    "psadbw %xmm7,%xmm1                        \n"
-    "pshufd $0xd8,%xmm0,%xmm0                  \n"
-    "pshufd $0x8d,%xmm1,%xmm1                  \n"
-    "por    %xmm1,%xmm0                        \n"
-    "psrlw  $0x3,%xmm0                         \n"
-    "packuswb %xmm0,%xmm0                      \n"
-    "packuswb %xmm0,%xmm0                      \n"
-    "movd   %xmm0,(%edi)                       \n"
-    "lea    0x4(%edi),%edi                     \n"
-    "sub    $0x4,%ecx                          \n"
-    "ja     1b                                 \n"
-    "popa                                      \n"
-    "ret                                       \n"
-);
-
-// -fPIC is used for the magiccam plugin; skip these non-PIC routines when __PIC__ is defined.
-#if !defined(__PIC__)
-#define HAS_SCALEROWDOWN34_SSSE3
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
-                                     uint8* dst_ptr, int dst_width);
-  asm(
-    DECLARE_FUNCTION(ScaleRowDown34_SSSE3)
-    "pusha                                     \n"
-    "mov    0x24(%esp),%esi                    \n"
-    "mov    0x2c(%esp),%edi                    \n"
-    "mov    0x30(%esp),%ecx                    \n"
-    "movdqa _shuf0,%xmm3                       \n"
-    "movdqa _shuf1,%xmm4                       \n"
-    "movdqa _shuf2,%xmm5                       \n"
-
-"1:"
-    "movdqa (%esi),%xmm0                       \n"
-    "movdqa 0x10(%esi),%xmm2                   \n"
-    "lea    0x20(%esi),%esi                    \n"
-    "movdqa %xmm2,%xmm1                        \n"
-    "palignr $0x8,%xmm0,%xmm1                  \n"
-    "pshufb %xmm3,%xmm0                        \n"
-    "pshufb %xmm4,%xmm1                        \n"
-    "pshufb %xmm5,%xmm2                        \n"
-    "movq   %xmm0,(%edi)                       \n"
-    "movq   %xmm1,0x8(%edi)                    \n"
-    "movq   %xmm2,0x10(%edi)                   \n"
-    "lea    0x18(%edi),%edi                    \n"
-    "sub    $0x18,%ecx                         \n"
-    "ja     1b                                 \n"
-    "popa                                      \n"
-    "ret                                       \n"
-);
-
-void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                           uint8* dst_ptr, int dst_width);
-  asm(
-    DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3)
-    "pusha                                     \n"
-    "mov    0x24(%esp),%esi                    \n"
-    "mov    0x28(%esp),%ebp                    \n"
-    "mov    0x2c(%esp),%edi                    \n"
-    "mov    0x30(%esp),%ecx                    \n"
-    "movdqa _shuf01,%xmm2                      \n"
-    "movdqa _shuf11,%xmm3                      \n"
-    "movdqa _shuf21,%xmm4                      \n"
-    "movdqa _madd01,%xmm5                      \n"
-    "movdqa _madd11,%xmm6                      \n"
-    "movdqa _round34,%xmm7                     \n"
-
-"1:"
-    "movdqa (%esi),%xmm0                       \n"
-    "movdqa (%esi,%ebp),%xmm1                  \n"
-    "pavgb  %xmm1,%xmm0                        \n"
-    "pshufb %xmm2,%xmm0                        \n"
-    "pmaddubsw %xmm5,%xmm0                     \n"
-    "paddsw %xmm7,%xmm0                        \n"
-    "psrlw  $0x2,%xmm0                         \n"
-    "packuswb %xmm0,%xmm0                      \n"
-    "movq   %xmm0,(%edi)                       \n"
-    "movdqu 0x8(%esi),%xmm0                    \n"
-    "movdqu 0x8(%esi,%ebp),%xmm1               \n"
-    "pavgb  %xmm1,%xmm0                        \n"
-    "pshufb %xmm3,%xmm0                        \n"
-    "pmaddubsw %xmm6,%xmm0                     \n"
-    "paddsw %xmm7,%xmm0                        \n"
-    "psrlw  $0x2,%xmm0                         \n"
-    "packuswb %xmm0,%xmm0                      \n"
-    "movq   %xmm0,0x8(%edi)                    \n"
-    "movdqa 0x10(%esi),%xmm0                   \n"
-    "movdqa 0x10(%esi,%ebp),%xmm1              \n"
-    "lea    0x20(%esi),%esi                    \n"
-    "pavgb  %xmm1,%xmm0                        \n"
-    "pshufb %xmm4,%xmm0                        \n"
-    "movdqa  _madd21,%xmm1                     \n"
-    "pmaddubsw %xmm1,%xmm0                     \n"
-    "paddsw %xmm7,%xmm0                        \n"
-    "psrlw  $0x2,%xmm0                         \n"
-    "packuswb %xmm0,%xmm0                      \n"
-    "movq   %xmm0,0x10(%edi)                   \n"
-    "lea    0x18(%edi),%edi                    \n"
-    "sub    $0x18,%ecx                         \n"
-    "ja     1b                                 \n"
-
-    "popa                                      \n"
-    "ret                                       \n"
-);
-
-void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                           uint8* dst_ptr, int dst_width);
-  asm(
-    DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3)
-    "pusha                                     \n"
-    "mov    0x24(%esp),%esi                    \n"
-    "mov    0x28(%esp),%ebp                    \n"
-    "mov    0x2c(%esp),%edi                    \n"
-    "mov    0x30(%esp),%ecx                    \n"
-    "movdqa _shuf01,%xmm2                      \n"
-    "movdqa _shuf11,%xmm3                      \n"
-    "movdqa _shuf21,%xmm4                      \n"
-    "movdqa _madd01,%xmm5                      \n"
-    "movdqa _madd11,%xmm6                      \n"
-    "movdqa _round34,%xmm7                     \n"
-
-"1:"
-    "movdqa (%esi),%xmm0                       \n"
-    "movdqa (%esi,%ebp,1),%xmm1                \n"
-    "pavgb  %xmm0,%xmm1                        \n"
-    "pavgb  %xmm1,%xmm0                        \n"
-    "pshufb %xmm2,%xmm0                        \n"
-    "pmaddubsw %xmm5,%xmm0                     \n"
-    "paddsw %xmm7,%xmm0                        \n"
-    "psrlw  $0x2,%xmm0                         \n"
-    "packuswb %xmm0,%xmm0                      \n"
-    "movq   %xmm0,(%edi)                       \n"
-    "movdqu 0x8(%esi),%xmm0                    \n"
-    "movdqu 0x8(%esi,%ebp,1),%xmm1             \n"
-    "pavgb  %xmm0,%xmm1                        \n"
-    "pavgb  %xmm1,%xmm0                        \n"
-    "pshufb %xmm3,%xmm0                        \n"
-    "pmaddubsw %xmm6,%xmm0                     \n"
-    "paddsw %xmm7,%xmm0                        \n"
-    "psrlw  $0x2,%xmm0                         \n"
-    "packuswb %xmm0,%xmm0                      \n"
-    "movq   %xmm0,0x8(%edi)                    \n"
-    "movdqa 0x10(%esi),%xmm0                   \n"
-    "movdqa 0x10(%esi,%ebp,1),%xmm1            \n"
-    "lea    0x20(%esi),%esi                    \n"
-    "pavgb  %xmm0,%xmm1                        \n"
-    "pavgb  %xmm1,%xmm0                        \n"
-    "pshufb %xmm4,%xmm0                        \n"
-    "movdqa  _madd21,%xmm1                     \n"
-    "pmaddubsw %xmm1,%xmm0                     \n"
-    "paddsw %xmm7,%xmm0                        \n"
-    "psrlw  $0x2,%xmm0                         \n"
-    "packuswb %xmm0,%xmm0                      \n"
-    "movq   %xmm0,0x10(%edi)                   \n"
-    "lea    0x18(%edi),%edi                    \n"
-    "sub    $0x18,%ecx                         \n"
-    "ja     1b                                 \n"
-    "popa                                      \n"
-    "ret                                       \n"
-);
-
-#define HAS_SCALEROWDOWN38_SSSE3
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
-                                     uint8* dst_ptr, int dst_width);
-  asm(
-    DECLARE_FUNCTION(ScaleRowDown38_SSSE3)
-    "pusha                                     \n"
-    "mov    0x24(%esp),%esi                    \n"
-    "mov    0x28(%esp),%edx                    \n"
-    "mov    0x2c(%esp),%edi                    \n"
-    "mov    0x30(%esp),%ecx                    \n"
-    "movdqa _shuf38a ,%xmm4                    \n"
-    "movdqa _shuf38b ,%xmm5                    \n"
-
-"1:"
-    "movdqa (%esi),%xmm0                       \n"
-    "movdqa 0x10(%esi),%xmm1                   \n"
-    "lea    0x20(%esi),%esi                    \n"
-    "pshufb %xmm4,%xmm0                        \n"
-    "pshufb %xmm5,%xmm1                        \n"
-    "paddusb %xmm1,%xmm0                       \n"
-    "movq   %xmm0,(%edi)                       \n"
-    "movhlps %xmm0,%xmm1                       \n"
-    "movd   %xmm1,0x8(%edi)                    \n"
-    "lea    0xc(%edi),%edi                     \n"
-    "sub    $0xc,%ecx                          \n"
-    "ja     1b                                 \n"
-    "popa                                      \n"
-    "ret                                       \n"
-);
-
-void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                           uint8* dst_ptr, int dst_width);
-  asm(
-    DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3)
-    "pusha                                     \n"
-    "mov    0x24(%esp),%esi                    \n"
-    "mov    0x28(%esp),%edx                    \n"
-    "mov    0x2c(%esp),%edi                    \n"
-    "mov    0x30(%esp),%ecx                    \n"
-    "movdqa _shufac0,%xmm4                     \n"
-    "movdqa _shufac3,%xmm5                     \n"
-    "movdqa _scaleac3,%xmm6                    \n"
-    "pxor   %xmm7,%xmm7                        \n"
-
-"1:"
-    "movdqa (%esi),%xmm0                       \n"
-    "movdqa (%esi,%edx,1),%xmm2                \n"
-    "movhlps %xmm0,%xmm1                       \n"
-    "movhlps %xmm2,%xmm3                       \n"
-    "punpcklbw %xmm7,%xmm0                     \n"
-    "punpcklbw %xmm7,%xmm1                     \n"
-    "punpcklbw %xmm7,%xmm2                     \n"
-    "punpcklbw %xmm7,%xmm3                     \n"
-    "paddusw %xmm2,%xmm0                       \n"
-    "paddusw %xmm3,%xmm1                       \n"
-    "movdqa (%esi,%edx,2),%xmm2                \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "movhlps %xmm2,%xmm3                       \n"
-    "punpcklbw %xmm7,%xmm2                     \n"
-    "punpcklbw %xmm7,%xmm3                     \n"
-    "paddusw %xmm2,%xmm0                       \n"
-    "paddusw %xmm3,%xmm1                       \n"
-    "movdqa %xmm0,%xmm2                        \n"
-    "psrldq $0x2,%xmm0                         \n"
-    "paddusw %xmm0,%xmm2                       \n"
-    "psrldq $0x2,%xmm0                         \n"
-    "paddusw %xmm0,%xmm2                       \n"
-    "pshufb %xmm4,%xmm2                        \n"
-    "movdqa %xmm1,%xmm3                        \n"
-    "psrldq $0x2,%xmm1                         \n"
-    "paddusw %xmm1,%xmm3                       \n"
-    "psrldq $0x2,%xmm1                         \n"
-    "paddusw %xmm1,%xmm3                       \n"
-    "pshufb %xmm5,%xmm3                        \n"
-    "paddusw %xmm3,%xmm2                       \n"
-    "pmulhuw %xmm6,%xmm2                       \n"
-    "packuswb %xmm2,%xmm2                      \n"
-    "movd   %xmm2,(%edi)                       \n"
-    "pextrw $0x2,%xmm2,%eax                    \n"
-    "mov    %ax,0x4(%edi)                      \n"
-    "lea    0x6(%edi),%edi                     \n"
-    "sub    $0x6,%ecx                          \n"
-    "ja     1b                                 \n"
-    "popa                                      \n"
-    "ret                                       \n"
-);
-
-void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                           uint8* dst_ptr, int dst_width);
-  asm(
-    DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3)
-    "pusha                                     \n"
-    "mov    0x24(%esp),%esi                    \n"
-    "mov    0x28(%esp),%edx                    \n"
-    "mov    0x2c(%esp),%edi                    \n"
-    "mov    0x30(%esp),%ecx                    \n"
-    "movdqa _shufab0,%xmm4                     \n"
-    "movdqa _shufab1,%xmm5                     \n"
-    "movdqa _shufab2,%xmm6                     \n"
-    "movdqa _scaleab2,%xmm7                    \n"
-
-"1:"
-    "movdqa (%esi),%xmm2                       \n"
-    "pavgb  (%esi,%edx,1),%xmm2                \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "movdqa %xmm2,%xmm0                        \n"
-    "pshufb %xmm4,%xmm0                        \n"
-    "movdqa %xmm2,%xmm1                        \n"
-    "pshufb %xmm5,%xmm1                        \n"
-    "paddusw %xmm1,%xmm0                       \n"
-    "pshufb %xmm6,%xmm2                        \n"
-    "paddusw %xmm2,%xmm0                       \n"
-    "pmulhuw %xmm7,%xmm0                       \n"
-    "packuswb %xmm0,%xmm0                      \n"
-    "movd   %xmm0,(%edi)                       \n"
-    "pextrw $0x2,%xmm0,%eax                    \n"
-    "mov    %ax,0x4(%edi)                      \n"
-    "lea    0x6(%edi),%edi                     \n"
-    "sub    $0x6,%ecx                          \n"
-    "ja     1b                                 \n"
-    "popa                                      \n"
-    "ret                                       \n"
-);
-#endif // __PIC__
-
-#define HAS_SCALEADDROWS_SSE2
-void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
-                                  uint16* dst_ptr, int src_width,
-                                  int src_height);
-  asm(
-    DECLARE_FUNCTION(ScaleAddRows_SSE2)
-    "pusha                                     \n"
-    "mov    0x24(%esp),%esi                    \n"
-    "mov    0x28(%esp),%edx                    \n"
-    "mov    0x2c(%esp),%edi                    \n"
-    "mov    0x30(%esp),%ecx                    \n"
-    "mov    0x34(%esp),%ebx                    \n"
-    "pxor   %xmm5,%xmm5                        \n"
-
-"1:"
-    "movdqa (%esi),%xmm2                       \n"
-    "lea    (%esi,%edx,1),%eax                 \n"
-    "movhlps %xmm2,%xmm3                       \n"
-    "lea    -0x1(%ebx),%ebp                    \n"
-    "punpcklbw %xmm5,%xmm2                     \n"
-    "punpcklbw %xmm5,%xmm3                     \n"
-
-"2:"
-    "movdqa (%eax),%xmm0                       \n"
-    "lea    (%eax,%edx,1),%eax                 \n"
-    "movhlps %xmm0,%xmm1                       \n"
-    "punpcklbw %xmm5,%xmm0                     \n"
-    "punpcklbw %xmm5,%xmm1                     \n"
-    "paddusw %xmm0,%xmm2                       \n"
-    "paddusw %xmm1,%xmm3                       \n"
-    "sub    $0x1,%ebp                          \n"
-    "ja     2b                                 \n"
-
-    "movdqa %xmm2,(%edi)                       \n"
-    "movdqa %xmm3,0x10(%edi)                   \n"
-    "lea    0x20(%edi),%edi                    \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "sub    $0x10,%ecx                         \n"
-    "ja     1b                                 \n"
-    "popa                                      \n"
-    "ret                                       \n"
-);
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
-#define HAS_SCALEFILTERROWS_SSE2
-void ScaleFilterRows_SSE2(uint8* dst_ptr,
-                                     const uint8* src_ptr, int src_stride,
-                                     int dst_width, int source_y_fraction);
-  asm(
-    DECLARE_FUNCTION(ScaleFilterRows_SSE2)
-    "push   %esi                               \n"
-    "push   %edi                               \n"
-    "mov    0xc(%esp),%edi                     \n"
-    "mov    0x10(%esp),%esi                    \n"
-    "mov    0x14(%esp),%edx                    \n"
-    "mov    0x18(%esp),%ecx                    \n"
-    "mov    0x1c(%esp),%eax                    \n"
-    "cmp    $0x0,%eax                          \n"
-    "je     2f                                 \n"
-    "cmp    $0x80,%eax                         \n"
-    "je     3f                                 \n"
-    "movd   %eax,%xmm6                         \n"
-    "punpcklwd %xmm6,%xmm6                     \n"
-    "pshufd $0x0,%xmm6,%xmm6                   \n"
-    "neg    %eax                               \n"
-    "add    $0x100,%eax                        \n"
-    "movd   %eax,%xmm5                         \n"
-    "punpcklwd %xmm5,%xmm5                     \n"
-    "pshufd $0x0,%xmm5,%xmm5                   \n"
-    "pxor   %xmm7,%xmm7                        \n"
-
-"1:"
-    "movdqa (%esi),%xmm0                       \n"
-    "movdqa (%esi,%edx,1),%xmm2                \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "movdqa %xmm0,%xmm1                        \n"
-    "movdqa %xmm2,%xmm3                        \n"
-    "punpcklbw %xmm7,%xmm0                     \n"
-    "punpcklbw %xmm7,%xmm2                     \n"
-    "punpckhbw %xmm7,%xmm1                     \n"
-    "punpckhbw %xmm7,%xmm3                     \n"
-    "pmullw %xmm5,%xmm0                        \n"
-    "pmullw %xmm5,%xmm1                        \n"
-    "pmullw %xmm6,%xmm2                        \n"
-    "pmullw %xmm6,%xmm3                        \n"
-    "paddusw %xmm2,%xmm0                       \n"
-    "paddusw %xmm3,%xmm1                       \n"
-    "psrlw  $0x8,%xmm0                         \n"
-    "psrlw  $0x8,%xmm1                         \n"
-    "packuswb %xmm1,%xmm0                      \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
-    "sub    $0x10,%ecx                         \n"
-    "ja     1b                                 \n"
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
-    "pop    %edi                               \n"
-    "pop    %esi                               \n"
-    "ret                                       \n"
-
-"2:"
-    "movdqa (%esi),%xmm0                       \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
-    "sub    $0x10,%ecx                         \n"
-    "ja     2b                                 \n"
-
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
-    "pop    %edi                               \n"
-    "pop    %esi                               \n"
-    "ret                                       \n"
-
-"3:"
-    "movdqa (%esi),%xmm0                       \n"
-    "movdqa (%esi,%edx,1),%xmm2                \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "pavgb  %xmm2,%xmm0                        \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
-    "sub    $0x10,%ecx                         \n"
-    "ja     3b                                 \n"
-
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
-    "pop    %edi                               \n"
-    "pop    %esi                               \n"
-    "ret                                       \n"
-);
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
-#define HAS_SCALEFILTERROWS_SSSE3
-void ScaleFilterRows_SSSE3(uint8* dst_ptr,
-                                      const uint8* src_ptr, int src_stride,
-                                      int dst_width, int source_y_fraction);
-  asm(
-    DECLARE_FUNCTION(ScaleFilterRows_SSSE3)
-    "push   %esi                               \n"
-    "push   %edi                               \n"
-    "mov    0xc(%esp),%edi                     \n"
-    "mov    0x10(%esp),%esi                    \n"
-    "mov    0x14(%esp),%edx                    \n"
-    "mov    0x18(%esp),%ecx                    \n"
-    "mov    0x1c(%esp),%eax                    \n"
-    "shr    %eax                               \n"
-    "cmp    $0x0,%eax                          \n"
-    "je     2f                                 \n"
-    "cmp    $0x40,%eax                         \n"
-    "je     3f                                 \n"
-    "mov    %al,%ah                            \n"
-    "neg    %al                                \n"
-    "add    $0x80,%al                          \n"
-    "movd   %eax,%xmm5                         \n"
-    "punpcklwd %xmm5,%xmm5                     \n"
-    "pshufd $0x0,%xmm5,%xmm5                   \n"
-
-"1:"
-    "movdqa (%esi),%xmm0                       \n"
-    "movdqa (%esi,%edx,1),%xmm2                \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "movdqa %xmm0,%xmm1                        \n"
-    "punpcklbw %xmm2,%xmm0                     \n"
-    "punpckhbw %xmm2,%xmm1                     \n"
-    "pmaddubsw %xmm5,%xmm0                     \n"
-    "pmaddubsw %xmm5,%xmm1                     \n"
-    "psrlw  $0x7,%xmm0                         \n"
-    "psrlw  $0x7,%xmm1                         \n"
-    "packuswb %xmm1,%xmm0                      \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
-    "sub    $0x10,%ecx                         \n"
-    "ja     1b                                 \n"
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
-    "pop    %edi                               \n"
-    "pop    %esi                               \n"
-    "ret                                       \n"
-
-"2:"
-    "movdqa (%esi),%xmm0                       \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
-    "sub    $0x10,%ecx                         \n"
-    "ja     2b                                 \n"
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
-    "pop    %edi                               \n"
-    "pop    %esi                               \n"
-    "ret                                       \n"
-
-"3:"
-    "movdqa (%esi),%xmm0                       \n"
-    "movdqa (%esi,%edx,1),%xmm2                \n"
-    "lea    0x10(%esi),%esi                    \n"
-    "pavgb  %xmm2,%xmm0                        \n"
-    "movdqa %xmm0,(%edi)                       \n"
-    "lea    0x10(%edi),%edi                    \n"
-    "sub    $0x10,%ecx                         \n"
-    "ja     3b                                 \n"
-    "mov    -0x1(%edi),%al                     \n"
-    "mov    %al,(%edi)                         \n"
-    "pop    %edi                               \n"
-    "pop    %esi                               \n"
-    "ret                                       \n"
-);
-
-#elif defined(__x86_64__)
-static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
-                                  uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "lea        (%3,%3,2),%%r10                  \n"
-  "pxor       %%xmm7,%%xmm7                    \n"
-"1:"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     0x10(%0),%%xmm1                  \n"
-  "movdqa     (%0,%3,1),%%xmm2                 \n"
-  "movdqa     0x10(%0,%3,1),%%xmm3             \n"
-  "pavgb      %%xmm2,%%xmm0                    \n"
-  "pavgb      %%xmm3,%%xmm1                    \n"
-  "movdqa     (%0,%3,2),%%xmm2                 \n"
-  "movdqa     0x10(%0,%3,2),%%xmm3             \n"
-  "movdqa     (%0,%%r10,1),%%xmm4              \n"
-  "movdqa     0x10(%0,%%r10,1),%%xmm5          \n"
-  "lea        (%0,%3,4),%%r11                  \n"
-  "lea        0x20(%0),%0                      \n"
-  "pavgb      %%xmm4,%%xmm2                    \n"
-  "pavgb      %%xmm5,%%xmm3                    \n"
-  "pavgb      %%xmm2,%%xmm0                    \n"
-  "pavgb      %%xmm3,%%xmm1                    \n"
-  "movdqa     0x0(%%r11),%%xmm2                \n"
-  "movdqa     0x10(%%r11),%%xmm3               \n"
-  "movdqa     0x0(%%r11,%3,1),%%xmm4           \n"
-  "movdqa     0x10(%%r11,%3,1),%%xmm5          \n"
-  "pavgb      %%xmm4,%%xmm2                    \n"
-  "pavgb      %%xmm5,%%xmm3                    \n"
-  "movdqa     0x0(%%r11,%3,2),%%xmm4           \n"
-  "movdqa     0x10(%%r11,%3,2),%%xmm5          \n"
-  "movdqa     0x0(%%r11,%%r10,1),%%xmm6        \n"
-  "pavgb      %%xmm6,%%xmm4                    \n"
-  "movdqa     0x10(%%r11,%%r10,1),%%xmm6       \n"
-  "pavgb      %%xmm6,%%xmm5                    \n"
-  "pavgb      %%xmm4,%%xmm2                    \n"
-  "pavgb      %%xmm5,%%xmm3                    \n"
-  "pavgb      %%xmm2,%%xmm0                    \n"
-  "pavgb      %%xmm3,%%xmm1                    \n"
-  "psadbw     %%xmm7,%%xmm0                    \n"
-  "psadbw     %%xmm7,%%xmm1                    \n"
-  "pshufd     $0xd8,%%xmm0,%%xmm0              \n"
-  "pshufd     $0x8d,%%xmm1,%%xmm1              \n"
-  "por        %%xmm1,%%xmm0                    \n"
-  "psrlw      $0x3,%%xmm0                      \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "movd       %%xmm0,(%1)                      \n"
-  "lea        0x4(%1),%1                       \n"
-  "sub        $0x4,%2                          \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", "r10", "r11", "xmm6", "xmm7"
-);
-}
-
-#define HAS_SCALEROWDOWN34_SSSE3
-static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
-                                 uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "movdqa     (%3),%%xmm3                      \n"
-  "movdqa     (%4),%%xmm4                      \n"
-  "movdqa     (%5),%%xmm5                      \n"
-"1:"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     0x10(%0),%%xmm2                  \n"
-  "lea        0x20(%0),%0                      \n"
-  "movdqa     %%xmm2,%%xmm1                    \n"
-  "palignr    $0x8,%%xmm0,%%xmm1               \n"
-  "pshufb     %%xmm3,%%xmm0                    \n"
-  "pshufb     %%xmm4,%%xmm1                    \n"
-  "pshufb     %%xmm5,%%xmm2                    \n"
-  "movq       %%xmm0,(%1)                      \n"
-  "movq       %%xmm1,0x8(%1)                   \n"
-  "movq       %%xmm2,0x10(%1)                  \n"
-  "lea        0x18(%1),%1                      \n"
-  "sub        $0x18,%2                         \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"(_shuf0),   // %3
-    "r"(_shuf1),   // %4
-    "r"(_shuf2)    // %5
-  : "memory", "cc"
-);
-}
-
-static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "movdqa     (%4),%%xmm2                      \n"  // _shuf01
-  "movdqa     (%5),%%xmm3                      \n"  // _shuf11
-  "movdqa     (%6),%%xmm4                      \n"  // _shuf21
-  "movdqa     (%7),%%xmm5                      \n"  // _madd01
-  "movdqa     (%8),%%xmm6                      \n"  // _madd11
-  "movdqa     (%9),%%xmm7                      \n"  // _round34
-  "movdqa     (%10),%%xmm8                     \n"  // _madd21
-"1:"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     (%0,%3),%%xmm1                   \n"
-  "pavgb      %%xmm1,%%xmm0                    \n"
-  "pshufb     %%xmm2,%%xmm0                    \n"
-  "pmaddubsw  %%xmm5,%%xmm0                    \n"
-  "paddsw     %%xmm7,%%xmm0                    \n"
-  "psrlw      $0x2,%%xmm0                      \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "movq       %%xmm0,(%1)                      \n"
-  "movdqu     0x8(%0),%%xmm0                   \n"
-  "movdqu     0x8(%0,%3),%%xmm1                \n"
-  "pavgb      %%xmm1,%%xmm0                    \n"
-  "pshufb     %%xmm3,%%xmm0                    \n"
-  "pmaddubsw  %%xmm6,%%xmm0                    \n"
-  "paddsw     %%xmm7,%%xmm0                    \n"
-  "psrlw      $0x2,%%xmm0                      \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "movq       %%xmm0,0x8(%1)                   \n"
-  "movdqa     0x10(%0),%%xmm0                  \n"
-  "movdqa     0x10(%0,%3),%%xmm1               \n"
-  "lea        0x20(%0),%0                      \n"
-  "pavgb      %%xmm1,%%xmm0                    \n"
-  "pshufb     %%xmm4,%%xmm0                    \n"
-  "pmaddubsw  %%xmm8,%%xmm0                    \n"
-  "paddsw     %%xmm7,%%xmm0                    \n"
-  "psrlw      $0x2,%%xmm0                      \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "movq       %%xmm0,0x10(%1)                  \n"
-  "lea        0x18(%1),%1                      \n"
-  "sub        $0x18,%2                         \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "r"(_shuf01),   // %4
-    "r"(_shuf11),   // %5
-    "r"(_shuf21),   // %6
-    "r"(_madd01),   // %7
-    "r"(_madd11),   // %8
-    "r"(_round34),  // %9
-    "r"(_madd21)    // %10
-  : "memory", "cc", "xmm6", "xmm7", "xmm8"
-);
-}
-
-static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "movdqa     (%4),%%xmm2                      \n"  // _shuf01
-  "movdqa     (%5),%%xmm3                      \n"  // _shuf11
-  "movdqa     (%6),%%xmm4                      \n"  // _shuf21
-  "movdqa     (%7),%%xmm5                      \n"  // _madd01
-  "movdqa     (%8),%%xmm6                      \n"  // _madd11
-  "movdqa     (%9),%%xmm7                      \n"  // _round34
-  "movdqa     (%10),%%xmm8                     \n"  // _madd21
-"1:"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     (%0,%3,1),%%xmm1                 \n"
-  "pavgb      %%xmm0,%%xmm1                    \n"
-  "pavgb      %%xmm1,%%xmm0                    \n"
-  "pshufb     %%xmm2,%%xmm0                    \n"
-  "pmaddubsw  %%xmm5,%%xmm0                    \n"
-  "paddsw     %%xmm7,%%xmm0                    \n"
-  "psrlw      $0x2,%%xmm0                      \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "movq       %%xmm0,(%1)                      \n"
-  "movdqu     0x8(%0),%%xmm0                   \n"
-  "movdqu     0x8(%0,%3,1),%%xmm1              \n"
-  "pavgb      %%xmm0,%%xmm1                    \n"
-  "pavgb      %%xmm1,%%xmm0                    \n"
-  "pshufb     %%xmm3,%%xmm0                    \n"
-  "pmaddubsw  %%xmm6,%%xmm0                    \n"
-  "paddsw     %%xmm7,%%xmm0                    \n"
-  "psrlw      $0x2,%%xmm0                      \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "movq       %%xmm0,0x8(%1)                   \n"
-  "movdqa     0x10(%0),%%xmm0                  \n"
-  "movdqa     0x10(%0,%3,1),%%xmm1             \n"
-  "lea        0x20(%0),%0                      \n"
-  "pavgb      %%xmm0,%%xmm1                    \n"
-  "pavgb      %%xmm1,%%xmm0                    \n"
-  "pshufb     %%xmm4,%%xmm0                    \n"
-  "pmaddubsw  %%xmm8,%%xmm0                    \n"
-  "paddsw     %%xmm7,%%xmm0                    \n"
-  "psrlw      $0x2,%%xmm0                      \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "movq       %%xmm0,0x10(%1)                  \n"
-  "lea        0x18(%1),%1                      \n"
-  "sub        $0x18,%2                         \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "r"(_shuf01),   // %4
-    "r"(_shuf11),   // %5
-    "r"(_shuf21),   // %6
-    "r"(_madd01),   // %7
-    "r"(_madd11),   // %8
-    "r"(_round34),  // %9
-    "r"(_madd21)    // %10
-  : "memory", "cc", "xmm6", "xmm7", "xmm8"
-);
-}
-
-#define HAS_SCALEROWDOWN38_SSSE3
-static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
-                                 uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "movdqa     (%3),%%xmm4                      \n"
-  "movdqa     (%4),%%xmm5                      \n"
-"1:"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     0x10(%0),%%xmm1                  \n"
-  "lea        0x20(%0),%0                      \n"
-  "pshufb     %%xmm4,%%xmm0                    \n"
-  "pshufb     %%xmm5,%%xmm1                    \n"
-  "paddusb    %%xmm1,%%xmm0                    \n"
-  "movq       %%xmm0,(%1)                      \n"
-  "movhlps    %%xmm0,%%xmm1                    \n"
-  "movd       %%xmm1,0x8(%1)                   \n"
-  "lea        0xc(%1),%1                       \n"
-  "sub        $0xc,%2                          \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"(_shuf38a),  // %3
-    "r"(_shuf38b)   // %4
-  : "memory", "cc"
-);
-}
-
-static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "movdqa     (%4),%%xmm4                      \n"
-  "movdqa     (%5),%%xmm5                      \n"
-  "movdqa     (%6),%%xmm6                      \n"
-  "pxor       %%xmm7,%%xmm7                    \n"
-"1:"
-  "movdqa     (%0),%%xmm0                      \n"
-  "movdqa     (%0,%3,1),%%xmm2                 \n"
-  "movhlps    %%xmm0,%%xmm1                    \n"
-  "movhlps    %%xmm2,%%xmm3                    \n"
-  "punpcklbw  %%xmm7,%%xmm0                    \n"
-  "punpcklbw  %%xmm7,%%xmm1                    \n"
-  "punpcklbw  %%xmm7,%%xmm2                    \n"
-  "punpcklbw  %%xmm7,%%xmm3                    \n"
-  "paddusw    %%xmm2,%%xmm0                    \n"
-  "paddusw    %%xmm3,%%xmm1                    \n"
-  "movdqa     (%0,%3,2),%%xmm2                 \n"
-  "lea        0x10(%0),%0                      \n"
-  "movhlps    %%xmm2,%%xmm3                    \n"
-  "punpcklbw  %%xmm7,%%xmm2                    \n"
-  "punpcklbw  %%xmm7,%%xmm3                    \n"
-  "paddusw    %%xmm2,%%xmm0                    \n"
-  "paddusw    %%xmm3,%%xmm1                    \n"
-  "movdqa     %%xmm0,%%xmm2                    \n"
-  "psrldq     $0x2,%%xmm0                      \n"
-  "paddusw    %%xmm0,%%xmm2                    \n"
-  "psrldq     $0x2,%%xmm0                      \n"
-  "paddusw    %%xmm0,%%xmm2                    \n"
-  "pshufb     %%xmm4,%%xmm2                    \n"
-  "movdqa     %%xmm1,%%xmm3                    \n"
-  "psrldq     $0x2,%%xmm1                      \n"
-  "paddusw    %%xmm1,%%xmm3                    \n"
-  "psrldq     $0x2,%%xmm1                      \n"
-  "paddusw    %%xmm1,%%xmm3                    \n"
-  "pshufb     %%xmm5,%%xmm3                    \n"
-  "paddusw    %%xmm3,%%xmm2                    \n"
-  "pmulhuw    %%xmm6,%%xmm2                    \n"
-  "packuswb   %%xmm2,%%xmm2                    \n"
-  "movd       %%xmm2,(%1)                      \n"
-  "pextrw     $0x2,%%xmm2,%%eax                \n"
-  "mov        %%ax,0x4(%1)                     \n"
-  "lea        0x6(%1),%1                       \n"
-  "sub        $0x6,%2                          \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "r"(_shufac0),   // %4
-    "r"(_shufac3),   // %5
-    "r"(_scaleac3)   // %6
-  : "memory", "cc", "rax", "xmm6", "xmm7"
-);
-}
-
-static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
-                                       uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "movdqa     (%4),%%xmm4                      \n"
-  "movdqa     (%5),%%xmm5                      \n"
-  "movdqa     (%6),%%xmm6                      \n"
-  "movdqa     (%7),%%xmm7                      \n"
-"1:"
-  "movdqa     (%0),%%xmm2                      \n"
-  "pavgb      (%0,%3,1),%%xmm2                 \n"
-  "lea        0x10(%0),%0                      \n"
-  "movdqa     %%xmm2,%%xmm0                    \n"
-  "pshufb     %%xmm4,%%xmm0                    \n"
-  "movdqa     %%xmm2,%%xmm1                    \n"
-  "pshufb     %%xmm5,%%xmm1                    \n"
-  "paddusw    %%xmm1,%%xmm0                    \n"
-  "pshufb     %%xmm6,%%xmm2                    \n"
-  "paddusw    %%xmm2,%%xmm0                    \n"
-  "pmulhuw    %%xmm7,%%xmm0                    \n"
-  "packuswb   %%xmm0,%%xmm0                    \n"
-  "movd       %%xmm0,(%1)                      \n"
-  "pextrw     $0x2,%%xmm0,%%eax                \n"
-  "mov        %%ax,0x4(%1)                     \n"
-  "lea        0x6(%1),%1                       \n"
-  "sub        $0x6,%2                          \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "r"(_shufab0),   // %4
-    "r"(_shufab1),   // %5
-    "r"(_shufab2),   // %6
-    "r"(_scaleab2)   // %7
-  : "memory", "cc", "rax", "xmm6", "xmm7"
-);
-}
-
-#define HAS_SCALEADDROWS_SSE2
-static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
-                              uint16* dst_ptr, int src_width,
-                              int src_height) {
-  asm volatile (
-  "pxor       %%xmm5,%%xmm5                    \n"
-"1:"
-  "movdqa     (%0),%%xmm2                      \n"
-  "lea        (%0,%4,1),%%r10                  \n"
-  "movhlps    %%xmm2,%%xmm3                    \n"
-  "lea        -0x1(%3),%%r11                   \n"
-  "punpcklbw  %%xmm5,%%xmm2                    \n"
-  "punpcklbw  %%xmm5,%%xmm3                    \n"
-
-"2:"
-  "movdqa     (%%r10),%%xmm0                   \n"
-  "lea        (%%r10,%4,1),%%r10               \n"
-  "movhlps    %%xmm0,%%xmm1                    \n"
-  "punpcklbw  %%xmm5,%%xmm0                    \n"
-  "punpcklbw  %%xmm5,%%xmm1                    \n"
-  "paddusw    %%xmm0,%%xmm2                    \n"
-  "paddusw    %%xmm1,%%xmm3                    \n"
-  "sub        $0x1,%%r11                       \n"
-  "ja         2b                               \n"
-
-  "movdqa     %%xmm2,(%1)                      \n"
-  "movdqa     %%xmm3,0x10(%1)                  \n"
-  "lea        0x20(%1),%1                      \n"
-  "lea        0x10(%0),%0                      \n"
-  "sub        $0x10,%2                         \n"
-  "ja         1b                               \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(src_width),   // %2
-    "+r"(src_height)   // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", "r10", "r11"
-);
-}
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
-#define HAS_SCALEFILTERROWS_SSE2
-static void ScaleFilterRows_SSE2(uint8* dst_ptr,
-                                 const uint8* src_ptr, int src_stride,
-                                 int dst_width, int source_y_fraction) {
-  if (source_y_fraction == 0) {
-    asm volatile (
-    "1:"
-      "movdqa     (%1),%%xmm0                  \n"
-      "lea        0x10(%1),%1                  \n"
-      "movdqa     %%xmm0,(%0)                  \n"
-      "lea        0x10(%0),%0                  \n"
-      "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
-      "mov        -0x1(%0),%%al                \n"
-      "mov        %%al,(%0)                    \n"
-      : "+r"(dst_ptr),     // %0
-        "+r"(src_ptr),     // %1
-        "+r"(dst_width)    // %2
-      :
-      : "memory", "cc", "rax"
-    );
-    return;
-  } else if (source_y_fraction == 128) {
-    asm volatile (
-    "1:"
-      "movdqa     (%1),%%xmm0                  \n"
-      "movdqa     (%1,%3,1),%%xmm2             \n"
-      "lea        0x10(%1),%1                  \n"
-      "pavgb      %%xmm2,%%xmm0                \n"
-      "movdqa     %%xmm0,(%0)                  \n"
-      "lea        0x10(%0),%0                  \n"
-      "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
-      "mov        -0x1(%0),%%al                \n"
-      "mov        %%al,(%0)                    \n"
-      : "+r"(dst_ptr),     // %0
-        "+r"(src_ptr),     // %1
-        "+r"(dst_width)    // %2
-      : "r"((intptr_t)(src_stride))  // %3
-      : "memory", "cc", "rax"
-    );
-    return;
-  } else {
-    asm volatile (
-      "mov        %3,%%eax                     \n"
-      "movd       %%eax,%%xmm6                 \n"
-      "punpcklwd  %%xmm6,%%xmm6                \n"
-      "pshufd     $0x0,%%xmm6,%%xmm6           \n"
-      "neg        %%eax                        \n"
-      "add        $0x100,%%eax                 \n"
-      "movd       %%eax,%%xmm5                 \n"
-      "punpcklwd  %%xmm5,%%xmm5                \n"
-      "pshufd     $0x0,%%xmm5,%%xmm5           \n"
-      "pxor       %%xmm7,%%xmm7                \n"
-    "1:"
-      "movdqa     (%1),%%xmm0                  \n"
-      "movdqa     (%1,%4,1),%%xmm2             \n"
-      "lea        0x10(%1),%1                  \n"
-      "movdqa     %%xmm0,%%xmm1                \n"
-      "movdqa     %%xmm2,%%xmm3                \n"
-      "punpcklbw  %%xmm7,%%xmm0                \n"
-      "punpcklbw  %%xmm7,%%xmm2                \n"
-      "punpckhbw  %%xmm7,%%xmm1                \n"
-      "punpckhbw  %%xmm7,%%xmm3                \n"
-      "pmullw     %%xmm5,%%xmm0                \n"
-      "pmullw     %%xmm5,%%xmm1                \n"
-      "pmullw     %%xmm6,%%xmm2                \n"
-      "pmullw     %%xmm6,%%xmm3                \n"
-      "paddusw    %%xmm2,%%xmm0                \n"
-      "paddusw    %%xmm3,%%xmm1                \n"
-      "psrlw      $0x8,%%xmm0                  \n"
-      "psrlw      $0x8,%%xmm1                  \n"
-      "packuswb   %%xmm1,%%xmm0                \n"
-      "movdqa     %%xmm0,(%0)                  \n"
-      "lea        0x10(%0),%0                  \n"
-      "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
-      "mov        -0x1(%0),%%al                \n"
-      "mov        %%al,(%0)                    \n"
-      : "+r"(dst_ptr),     // %0
-        "+r"(src_ptr),     // %1
-        "+r"(dst_width),   // %2
-        "+r"(source_y_fraction)  // %3
-      : "r"((intptr_t)(src_stride))  // %4
-      : "memory", "cc", "rax", "xmm6", "xmm7"
-    );
-  }
-  return;
-}
-
-// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
-#define HAS_SCALEFILTERROWS_SSSE3
-static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
-                                  const uint8* src_ptr, int src_stride,
-                                  int dst_width, int source_y_fraction) {
-  source_y_fraction >>= 1;
-  if (source_y_fraction == 0) {
-    asm volatile (
-   "1:"
-      "movdqa     (%1),%%xmm0                  \n"
-      "lea        0x10(%1),%1                  \n"
-      "movdqa     %%xmm0,(%0)                  \n"
-      "lea        0x10(%0),%0                  \n"
-      "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
-      "mov        -0x1(%0),%%al                \n"
-      "mov        %%al,(%0)                    \n"
-      : "+r"(dst_ptr),     // %0
-        "+r"(src_ptr),     // %1
-        "+r"(dst_width)    // %2
-      :
-      : "memory", "cc", "rax"
-    );
-    return;
-  } else if (source_y_fraction == 64) {
-    asm volatile (
-    "1:"
-      "movdqa     (%1),%%xmm0                  \n"
-      "movdqa     (%1,%3,1),%%xmm2             \n"
-      "lea        0x10(%1),%1                  \n"
-      "pavgb      %%xmm2,%%xmm0                \n"
-      "movdqa     %%xmm0,(%0)                  \n"
-      "lea        0x10(%0),%0                  \n"
-      "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
-      "mov        -0x1(%0),%%al                \n"
-      "mov        %%al,(%0)                    \n"
-      : "+r"(dst_ptr),     // %0
-        "+r"(src_ptr),     // %1
-        "+r"(dst_width)    // %2
-      : "r"((intptr_t)(src_stride))  // %3
-     : "memory", "cc", "rax"
-    );
-    return;
-  } else {
-    asm volatile (
-      "mov        %3,%%eax                     \n"
-      "mov        %%al,%%ah                    \n"
-      "neg        %%al                         \n"
-      "add        $0x80,%%al                   \n"
-      "movd       %%eax,%%xmm5                 \n"
-      "punpcklwd  %%xmm5,%%xmm5                \n"
-      "pshufd     $0x0,%%xmm5,%%xmm5           \n"
-    "1:"
-      "movdqa     (%1),%%xmm0                  \n"
-      "movdqa     (%1,%4,1),%%xmm2             \n"
-      "lea        0x10(%1),%1                  \n"
-      "movdqa     %%xmm0,%%xmm1                \n"
-      "punpcklbw  %%xmm2,%%xmm0                \n"
-      "punpckhbw  %%xmm2,%%xmm1                \n"
-      "pmaddubsw  %%xmm5,%%xmm0                \n"
-      "pmaddubsw  %%xmm5,%%xmm1                \n"
-      "psrlw      $0x7,%%xmm0                  \n"
-      "psrlw      $0x7,%%xmm1                  \n"
-      "packuswb   %%xmm1,%%xmm0                \n"
-      "movdqa     %%xmm0,(%0)                  \n"
-      "lea        0x10(%0),%0                  \n"
-      "sub        $0x10,%2                     \n"
-      "ja         1b                           \n"
-      "mov        -0x1(%0),%%al                \n"
-      "mov        %%al,(%0)                    \n"
-      : "+r"(dst_ptr),     // %0
-        "+r"(src_ptr),     // %1
-        "+r"(dst_width),   // %2
-        "+r"(source_y_fraction)  // %3
-      : "r"((intptr_t)(src_stride))  // %4
-      : "memory", "cc", "rax"
-    );
-  }
-  return;
-}
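Editor's note on the two bilinear row filters above (illustration only, not part of the deleted file). With f = source_y_fraction:

/* SSE2 path:  out = (row0 * (256 - f) + row1 * f) >> 8                   */
/* SSSE3 path: out = (row0 * (128 - f/2) + row1 * (f/2)) >> 7             */
/* The SSSE3 variant halves the fraction ("shr %eax" in the 32-bit        */
/* version, ">>= 1" in the 64-bit wrapper) so the byte weight pair fits   */
/* pmaddubsw; the two forms agree exactly for even fractions and differ   */
/* by at most one gray level otherwise.                                   */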
-#endif
-#endif
-
-// CPU agnostic row functions
-static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride,
-                            uint8* dst, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width; ++x) {
-    *dst++ = *src_ptr;
-    src_ptr += 2;
-  }
-}
-
-static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
-                               uint8* dst, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width; ++x) {
-    *dst++ = (src_ptr[0] + src_ptr[1] +
-              src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
-    src_ptr += 2;
-  }
-}
-
-static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride,
-                            uint8* dst, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width; ++x) {
-    *dst++ = *src_ptr;
-    src_ptr += 4;
-  }
-}
-
-static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
-                               uint8* dst, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width; ++x) {
-    *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-              src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
-              src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
-              src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
-              src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
-              src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
-              src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
-              8) >> 4;
-    src_ptr += 4;
-  }
-}
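A small worked example of the rounding bias used by the two box averages above (editor's illustration, arbitrary values):

/* 2x2 box over pixels 10, 11, 12, 13:                                    */
/*   (10 + 11 + 12 + 13 + 2) >> 2 == 12   (true average 11.5, rounded)    */
/* Without the +2 the result would truncate to 11; the +8 in the 4x4      */
/* case is the same bias of half the divisor (16).                        */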
-
-// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
-// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu.
-// The following 2 lines cause an error on Windows, so #defines are used instead.
-//static const int kMaxOutputWidth = 640;
-//static const int kMaxRow12 = 1280;         //kMaxOutputWidth * 2;
-#define kMaxOutputWidth   640
-#define kMaxRow12         1280
-
-static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride,
-                            uint8* dst, int dst_width) {
-  int x;
-  for (x = 0; x < dst_width; ++x) {
-    *dst++ = *src_ptr;
-    src_ptr += 8;
-  }
-}
-
-// Note: the calling code checks that dst_width is at most kMaxOutputWidth
-// and falls back to ScaleRowDown8_C otherwise.
-static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
-                               uint8* dst, int dst_width) {
-  ALIGN16(uint8 src_row[kMaxRow12 * 2]);
-  assert(dst_width <= kMaxOutputWidth);
-  ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
-  ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
-                     src_row + kMaxOutputWidth,
-                     dst_width * 2);
-  ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
-}
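Editor's arithmetic note tying this helper to the kMaxOutputWidth / kMaxRow12 constants above (not part of the original source):

/* 640 output pixels * 8 == 5120 input pixels, the widest row a 1/8       */
/* scale-down consumes here, and src_row[] is kMaxRow12 * 2 == 2560       */
/* bytes, which keeps the frame under the 4096-byte stack-check           */
/* threshold mentioned where the constants are defined.                   */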
-
-static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride,
-                             uint8* dst, int dst_width) {
-  uint8* dend;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  dend = dst + dst_width;
-  do {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[1];
-    dst[2] = src_ptr[3];
-    dst += 3;
-    src_ptr += 4;
-  } while (dst < dend);
-}
-
-// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
-                                   uint8* d, int dst_width) {
-  uint8* dend;
-  const uint8* s;
-  const uint8* t;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  dend = d + dst_width;
-  s = src_ptr;
-  t = src_ptr + src_stride;
-  do {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
-    d[0] = (a0 * 3 + b0 + 2) >> 2;
-    d[1] = (a1 * 3 + b1 + 2) >> 2;
-    d[2] = (a2 * 3 + b2 + 2) >> 2;
-    d += 3;
-    s += 4;
-    t += 4;
-  } while (d < dend);
-}
-
-// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
-                                   uint8* d, int dst_width) {
-  uint8* dend;
-  const uint8* s;
-  const uint8* t;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  dend = d + dst_width;
-  s = src_ptr;
-  t = src_ptr + src_stride;
-  do {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
-    d[0] = (a0 + b0 + 1) >> 1;
-    d[1] = (a1 + b1 + 1) >> 1;
-    d[2] = (a2 + b2 + 1) >> 1;
-    d += 3;
-    s += 4;
-    t += 4;
-  } while (d < dend);
-}
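The two 3/4 row filters above share one set of horizontal weights; only the vertical blend differs (editor's summary):

/* Horizontally, every 4 source pixels s0..s3 become 3 outputs:           */
/*   a0 = (3*s0 + s1 + 2) >> 2                                            */
/*   a1 = (s1 + s2 + 1) >> 1                                              */
/*   a2 = (s2 + 3*s3 + 2) >> 2                                            */
/* Vertically, the "_0" variant blends its two rows 3:1 and the "_1"      */
/* variant blends them 1:1, matching the sample positions of a 3/4        */
/* vertical downscale.                                                    */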
-
-#if defined(HAS_SCALEFILTERROWS_SSE2)
-// Filter row to 3/4
-static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
-                                int dst_width) {
-  uint8* dend;
-  const uint8* s;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  dend = dst_ptr + dst_width;
-  s = src_ptr;
-  do {
-    dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    dst_ptr += 3;
-    s += 4;
-  } while (dst_ptr < dend);
-}
-#endif
-
-static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                              int dst_width, int dx) {
-  int x = 0;
-  int j;
-  for (j = 0; j < dst_width; ++j) {
-    int xi = x >> 16;
-    int xf1 = x & 0xffff;
-    int xf0 = 65536 - xf1;
-
-    *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
-    x += dx;
-  }
-}
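A minimal standalone sketch of the 16.16 stepping used by ScaleFilterCols_C above (editor's example; demo_filter_cols and the pixel values are illustrative, not part of libyuv):

#include <stdio.h>

/* Standalone demo: scale 4 source pixels up to 8 destination pixels.
 * dx = (4 << 16) / 8 == 0x8000, i.e. half a source pixel per output. */
static void demo_filter_cols(void) {
  const unsigned char src[5] = {0, 100, 200, 50, 50};  /* extra pixel keeps xi + 1 in range */
  unsigned char dst[8];
  int dx = (4 << 16) / 8;
  int x = 0;
  int j;
  for (j = 0; j < 8; ++j) {
    int xi = x >> 16;
    int xf1 = x & 0xffff;
    int xf0 = 65536 - xf1;
    dst[j] = (unsigned char)((src[xi] * xf0 + src[xi + 1] * xf1) >> 16);
    x += dx;
  }
  for (j = 0; j < 8; ++j)
    printf("%d ", dst[j]);  /* prints: 0 50 100 150 200 125 50 50 */
  printf("\n");
}

int main(void) {
  demo_filter_cols();
  return 0;
}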
-
-// Does not work on Windows, so a #define is used instead.
-//static const int kMaxInputWidth = 2560;
-#define kMaxInputWidth    2560
-#if defined(HAS_SCALEFILTERROWS_SSE2)
-#define HAS_SCALEROWDOWN34_SSE2
-// Filter rows 0 and 1 together, 3 : 1
-static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  ALIGN16(uint8 row[kMaxInputWidth]);
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
-  ScaleFilterCols34_C(dst_ptr, row, dst_width);
-}
-
-// Filter rows 1 and 2 together, 1 : 1
-static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  ALIGN16(uint8 row[kMaxInputWidth]);
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
-  ScaleFilterCols34_C(dst_ptr, row, dst_width);
-}
-#endif
-
-static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride,
-                             uint8* dst, int dst_width) {
-  int x;
-  assert(dst_width % 3 == 0);
-  for (x = 0; x < dst_width; x += 3) {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[3];
-    dst[2] = src_ptr[6];
-    dst += 3;
-    src_ptr += 8;
-  }
-}
-
-// 8x3 -> 3x1
-static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
-                                   uint8* dst_ptr, int dst_width) {
-  int i;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (i = 0; i < dst_width; i+=3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
-        src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
-        src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
-        src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
-        src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
-        src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
-        (65536 / 6) >> 16;
-    src_ptr += 8;
-    dst_ptr += 3;
-  }
-}
-
-// 8x2 -> 3x1
-static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
-                                   uint8* dst_ptr, int dst_width) {
-  int i;
-  assert((dst_width % 3 == 0) && (dst_width > 0));
-  for (i = 0; i < dst_width; i+=3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
-        src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
-        src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
-        (65536 / 4) >> 16;
-    src_ptr += 8;
-    dst_ptr += 3;
-  }
-}
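Editor's note on the fixed-point division used by both 3/8 row filters above:

/* "* (65536 / N) >> 16" approximates dividing the N-pixel sum by N.      */
/* For N == 9: 65536 / 9 == 7281 and a fully saturated box gives          */
/*   (9 * 255) * 7281 >> 16 == 254,                                       */
/* so the result stays within one level of the exact average for 8-bit    */
/* data; the N == 6 and N == 4 cases behave the same or better.           */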
-
-// C version 8x2 -> 8x1
-static void ScaleFilterRows_C(uint8* dst_ptr,
-                              const uint8* src_ptr, int src_stride,
-                              int dst_width, int source_y_fraction) {
-  int y1_fraction;
-  int y0_fraction;
-  const uint8* src_ptr1;
-  uint8* end;
-  assert(dst_width > 0);
-  y1_fraction = source_y_fraction;
-  y0_fraction = 256 - y1_fraction;
-  src_ptr1 = src_ptr + src_stride;
-  end = dst_ptr + dst_width;
-  do {
-    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
-    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
-    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
-    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
-    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
-    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
-    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
-    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
-    src_ptr += 8;
-    src_ptr1 += 8;
-    dst_ptr += 8;
-  } while (dst_ptr < end);
-  dst_ptr[0] = dst_ptr[-1];
-}
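A worked instance of the blend above, plus a note on the trailing store (editor's illustration):

/* With source_y_fraction == 64 the weights are 192:64, i.e. 3/4 of the   */
/* first row plus 1/4 of the second; for pixels 100 and 200:              */
/*   (100 * 192 + 200 * 64) >> 8 == 125.                                  */
/* The final dst_ptr[0] = dst_ptr[-1] duplicates the last pixel one slot  */
/* past the row, matching the "mov -0x1(...),%al / mov %al,(...)" tail    */
/* of the assembly versions above.                                        */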
-
-void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
-  int x,y;
-  assert(src_width > 0);
-  assert(src_height > 0);
-  for (x = 0; x < src_width; ++x) {
-    const uint8* s = src_ptr + x;
-    int sum = 0;
-    for (y = 0; y < src_height; ++y) {
-      sum += s[0];
-      s += src_stride;
-    }
-    dst_ptr[x] = sum;
-  }
-}
-
-/**
- * Scale plane, 1/2
- *
- * This is an optimized version for scaling down a plane to 1/2 of
- * its original size.
- *
- */
-static void ScalePlaneDown2(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
-                            FilterModeEnum filtering) {
-  void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
-                        uint8* dst_ptr, int dst_width);
-  assert(IS_ALIGNED(src_width, 2));
-  assert(IS_ALIGNED(src_height, 2));
-
-#if defined(HAS_SCALEROWDOWN2_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
-  } else
-#endif
-#if defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 16) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
-  } else
-#endif
-  {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
-  }
-
-  {
-    int y;
-    for (y = 0; y < dst_height; ++y) {
-      ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
-      src_ptr += (src_stride << 1);
-      dst_ptr += dst_stride;
-    }
-  }
-}
-
-/**
- * Scale plane, 1/4
- *
- * This is an optimized version for scaling down a plane to 1/4 of
- * its original size.
- */
-static void ScalePlaneDown4(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
-                            FilterModeEnum filtering) {
-  void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
-                        uint8* dst_ptr, int dst_width);
-  assert(IS_ALIGNED(src_width, 4));
-  assert(IS_ALIGNED(src_height, 4));
-
-#if defined(HAS_SCALEROWDOWN4_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(dst_width, 4)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
-  } else
-#endif
-#if defined(HAS_SCALEROWDOWN4_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 8) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
-  } else
-#endif
-  {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
-  }
-
-  {
-    int y;
-    for (y = 0; y < dst_height; ++y) {
-      ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
-      src_ptr += (src_stride << 2);
-      dst_ptr += dst_stride;
-    }
-  }
-}
-
-/**
- * Scale plane, 1/8
- *
- * This is an optimized version for scaling down a plane to 1/8
- * of its original size.
- *
- */
-static void ScalePlaneDown8(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
-                            FilterModeEnum filtering) {
-  void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
-                        uint8* dst_ptr, int dst_width);
-  assert(IS_ALIGNED(src_width, 8));
-  assert(IS_ALIGNED(src_height, 8));
-
-#if defined(HAS_SCALEROWDOWN8_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
-  } else
-#endif
-  {
-    ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
-        ScaleRowDown8Int_C : ScaleRowDown8_C;
-  }
-
-  {
-    int y;
-    for (y = 0; y < dst_height; ++y) {
-      ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
-      src_ptr += (src_stride << 3);
-      dst_ptr += dst_stride;
-    }
-  }
-}
-
-/**
- * Scale plane down, 3/4
- *
- * Provided by Frank Barchard (fbarchard@google.com)
- *
- */
-static void ScalePlaneDown34(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
-                             FilterModeEnum filtering) {
-  void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
-                           uint8* dst_ptr, int dst_width);
-  void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
-                           uint8* dst_ptr, int dst_width);
-  assert(dst_width % 3 == 0);
-#if defined(HAS_SCALEROWDOWN34_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      (dst_width % 24 == 0)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_NEON;
-      ScaleRowDown34_1 = ScaleRowDown34_NEON;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
-    }
-  } else
-#endif
-
-#if defined(HAS_SCALEROWDOWN34_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
-      ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
-    }
-  } else
-#endif
-#if defined(HAS_SCALEROWDOWN34_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst_stride, 8) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
-      filtering) {
-    ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
-    ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
-  } else
-#endif
-  {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_C;
-      ScaleRowDown34_1 = ScaleRowDown34_C;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
-    }
-  }
-  {
-    int src_row = 0;
-    int y;
-    for (y = 0; y < dst_height; ++y) {
-      switch (src_row) {
-        case 0:
-          ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
-          break;
-
-        case 1:
-          ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
-          break;
-
-        case 2:
-          ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
-                           dst_ptr, dst_width);
-          break;
-      }
-      ++src_row;
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-      if (src_row >= 3) {
-        src_ptr += src_stride;
-        src_row = 0;
-      }
-    }
-  }
-}
-
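Row schedule of the loop above, spelled out (editor's note): each group of 3 output rows consumes 4 source rows.

/*   dst row 0: ScaleRowDown34_0 on src rows 0 and 1 (weights 3:1)        */
/*   dst row 1: ScaleRowDown34_1 on src rows 1 and 2 (weights 1:1)        */
/*   dst row 2: ScaleRowDown34_0 on src rows 3 and 2 via the negative     */
/*              stride, so row 3 carries the weight 3                     */
/* after which src_ptr has advanced by 4 * src_stride and the cycle       */
/* repeats.                                                               */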
-/**
- * Scale plane, 3/8
- *
- * This is an optimized version for scaling down a plane to 3/8
- * of its original size.
- *
- * Reduces 16x3 to 6x1
- */
-static void ScalePlaneDown38(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
-                             FilterModeEnum filtering) {
-  void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
-                           uint8* dst_ptr, int dst_width);
-  void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
-                           uint8* dst_ptr, int dst_width);
-  assert(dst_width % 3 == 0);
-#if defined(HAS_SCALEROWDOWN38_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      (dst_width % 12 == 0)) {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_NEON;
-      ScaleRowDown38_2 = ScaleRowDown38_NEON;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
-    }
-  } else
-#endif
-
-#if defined(HAS_SCALEROWDOWN38_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst_stride, 8) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
-      ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
-    }
-  } else
-#endif
-  {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_C;
-      ScaleRowDown38_2 = ScaleRowDown38_C;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
-    }
-  }
-  {
-    int src_row = 0;
-    int y;
-    for (y = 0; y < dst_height; ++y) {
-      switch (src_row) {
-        case 0:
-        case 1:
-          ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
-          src_ptr += src_stride * 3;
-          ++src_row;
-          break;
-
-        case 2:
-          ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
-          src_ptr += src_stride * 2;
-          src_row = 0;
-          break;
-      }
-      dst_ptr += dst_stride;
-    }
-  }
-}
-
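The analogous schedule for the 3/8 loop above (editor's note): each group of 3 output rows consumes 8 source rows.

/*   dst rows 0 and 1: ScaleRowDown38_3 over 3 source rows each (3 + 3)   */
/*   dst row 2:        ScaleRowDown38_2 over the remaining 2 source rows  */
/* giving the 3:8 vertical ratio.                                         */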
-__inline static uint32 SumBox(int iboxwidth, int iboxheight,
-                            int src_stride, const uint8* src_ptr) {
-  int x, y;
-  uint32 sum;
-  assert(iboxwidth > 0);
-  assert(iboxheight > 0);
-  sum = 0u;
-  for (y = 0; y < iboxheight; ++y) {
-    for (x = 0; x < iboxwidth; ++x) {
-      sum += src_ptr[x];
-    }
-    src_ptr += src_stride;
-  }
-  return sum;
-}
-
-static void ScalePlaneBoxRow(int dst_width, int boxheight,
-                             int dx, int src_stride,
-                             const uint8* src_ptr, uint8* dst_ptr) {
-  int x = 0;
-  int i;
-  for (i = 0; i < dst_width; ++i) {
-    int ix = x >> 16;
-    int boxwidth;
-    x += dx;
-    boxwidth = (x >> 16) - ix;
-    *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
-        (boxwidth * boxheight);
-  }
-}
-
-__inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
-  uint32 sum;
-  int x;
-  assert(iboxwidth > 0);
-  sum = 0u;
-  for (x = 0; x < iboxwidth; ++x) {
-    sum += src_ptr[x];
-  }
-  return sum;
-}
-
-static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
-  int scaletbl[2];
-  int minboxwidth = (dx >> 16);
-  scaletbl[0] = 65536 / (minboxwidth * boxheight);
-  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
-  {
-    int* scaleptr = scaletbl - minboxwidth;
-    int x = 0;
-    int i;
-    for (i = 0; i < dst_width; ++i) {
-      int ix = x >> 16;
-      int boxwidth;
-      x += dx;
-      boxwidth = (x >> 16) - ix;
-      *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
-    }
-  }
-}
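Editor's note on the scaleptr pointer arithmetic above, with a made-up example:

/* boxwidth is always minboxwidth or minboxwidth + 1, so                  */
/*   scaleptr[boxwidth] == scaletbl[boxwidth - minboxwidth]               */
/* selects 65536 / (boxwidth * boxheight) without a branch. For example,  */
/* dx == 0x28000 (2.5 source pixels per output) and boxheight == 2 give   */
/* minboxwidth == 2, scaletbl == {65536/4, 65536/6}, and boxwidth         */
/* alternating 2, 3, 2, 3, ...                                            */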
-
-static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
-  int boxwidth = (dx >> 16);
-  int scaleval = 65536 / (boxwidth * boxheight);
-  int x = 0;
-  int i;
-  for (i = 0; i < dst_width; ++i) {
-    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
-    x += boxwidth;
-  }
-}
-
-/**
- * Scale plane down to any dimensions, with interpolation (box filter).
- *
- * Same method as SimpleScale, which is fixed point: one destination pixel
- * is produced at a time, stepping through the source in 16.16 fixed point
- * and averaging a box of source pixels.
- */
-static void ScalePlaneBox(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          int src_stride, int dst_stride,
-                          const uint8* src_ptr, uint8* dst_ptr) {
-  int dx, dy;
-  assert(dst_width > 0);
-  assert(dst_height > 0);
-  dy = (src_height << 16) / dst_height;
-  dx = (src_width << 16) / dst_width;
-  if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
-      dst_height * 2 > src_height) {
-    uint8* dst = dst_ptr;
-    int dy = (src_height << 16) / dst_height;
-    int dx = (src_width << 16) / dst_width;
-    int y = 0;
-    int j;
-    for (j = 0; j < dst_height; ++j) {
-      int iy = y >> 16;
-      const uint8* const src = src_ptr + iy * src_stride;
-      int boxheight;
-      y += dy;
-      if (y > (src_height << 16)) {
-        y = (src_height << 16);
-      }
-      boxheight = (y >> 16) - iy;
-      ScalePlaneBoxRow(dst_width, boxheight,
-                       dx, src_stride,
-                       src, dst);
-
-      dst += dst_stride;
-    }
-  } else {
-    ALIGN16(uint16 row[kMaxInputWidth]);
-    void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
-                         uint16* dst_ptr, int src_width, int src_height);
-    void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
-                         const uint16* src_ptr, uint8* dst_ptr);
-#if defined(HAS_SCALEADDROWS_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) &&
-        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
-        IS_ALIGNED(src_width, 16)) {
-      ScaleAddRows = ScaleAddRows_SSE2;
-    } else
-#endif
-    {
-      ScaleAddRows = ScaleAddRows_C;
-    }
-    if (dx & 0xffff) {
-      ScaleAddCols = ScaleAddCols2_C;
-    } else {
-      ScaleAddCols = ScaleAddCols1_C;
-    }
-
-    {
-    int y = 0;
-      int j;
-      for (j = 0; j < dst_height; ++j) {
-      int iy = y >> 16;
-      const uint8* const src = src_ptr + iy * src_stride;
-        int boxheight;
-      y += dy;
-      if (y > (src_height << 16)) {
-        y = (src_height << 16);
-      }
-        boxheight = (y >> 16) - iy;
-      ScaleAddRows(src, src_stride, row, src_width, boxheight);
-      ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
-      dst_ptr += dst_stride;
-      }
-    }
-  }
-}
-
-/**
- * Scale plane to/from any dimensions, with interpolation.
- */
-static void ScalePlaneBilinearSimple(int src_width, int src_height,
-                                     int dst_width, int dst_height,
-                                     int src_stride, int dst_stride,
-                                     const uint8* src_ptr, uint8* dst_ptr) {
-  int i, j;
-  uint8* dst = dst_ptr;
-  int dx = (src_width << 16) / dst_width;
-  int dy = (src_height << 16) / dst_height;
-  int maxx = ((src_width - 1) << 16) - 1;
-  int maxy = ((src_height - 1) << 16) - 1;
-  int y = (dst_height < src_height) ? 32768 :
-      (src_height << 16) / dst_height - 32768;
-  for (i = 0; i < dst_height; ++i) {
-    int cy = (y < 0) ? 0 : y;
-    int yi = cy >> 16;
-    int yf = cy & 0xffff;
-    const uint8* const src = src_ptr + yi * src_stride;
-    int x = (dst_width < src_width) ? 32768 :
-        (src_width << 16) / dst_width - 32768;
-    for (j = 0; j < dst_width; ++j) {
-      int cx = (x < 0) ? 0 : x;
-      int xi = cx >> 16;
-      int xf = cx & 0xffff;
-      int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
-      int r1 = (src[xi + src_stride] * (65536 - xf) +
-          src[xi + src_stride + 1] * xf) >> 16;
-      *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
-      x += dx;
-      if (x > maxx)
-        x = maxx;
-    }
-    dst += dst_stride - dst_width;
-    y += dy;
-    if (y > maxy)
-      y = maxy;
-  }
-}
-
-/**
- * Scale plane to/from any dimensions, with bilinear
- * interpolation.
- */
-static void ScalePlaneBilinear(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint8* src_ptr, uint8* dst_ptr) {
-  int dy;
-  int dx;
-  assert(dst_width > 0);
-  assert(dst_height > 0);
-  dy = (src_height << 16) / dst_height;
-  dx = (src_width << 16) / dst_width;
-  if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
-    ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
-                             src_stride, dst_stride, src_ptr, dst_ptr);
-
-  } else {
-    ALIGN16(uint8 row[kMaxInputWidth + 1]);
-    void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
-                            int src_stride,
-                            int dst_width, int source_y_fraction);
-    void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-                            int dst_width, int dx);
-#if defined(HAS_SCALEFILTERROWS_SSSE3)
-    if (TestCpuFlag(kCpuHasSSSE3) &&
-        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
-        IS_ALIGNED(src_width, 16)) {
-      ScaleFilterRows = ScaleFilterRows_SSSE3;
-    } else
-#endif
-#if defined(HAS_SCALEFILTERROWS_SSE2)
-    if (TestCpuFlag(kCpuHasSSE2) &&
-        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
-        IS_ALIGNED(src_width, 16)) {
-      ScaleFilterRows = ScaleFilterRows_SSE2;
-    } else
-#endif
-    {
-      ScaleFilterRows = ScaleFilterRows_C;
-    }
-    ScaleFilterCols = ScaleFilterCols_C;
-
-    {
-    int y = 0;
-    int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows.
-      int j;
-      for (j = 0; j < dst_height; ++j) {
-      int iy = y >> 16;
-      int fy = (y >> 8) & 255;
-      const uint8* const src = src_ptr + iy * src_stride;
-      ScaleFilterRows(row, src, src_stride, src_width, fy);
-      ScaleFilterCols(dst_ptr, row, dst_width, dx);
-      dst_ptr += dst_stride;
-      y += dy;
-      if (y > maxy) {
-        y = maxy;
-      }
-    }
-  }
-}
-}
-
-/**
- * Scale plane to/from any dimensions, without interpolation.
- * Fixed point math is used for performance: The upper 16 bits
- * of x and dx is the integer part of the source position and
- * the lower 16 bits are the fixed decimal part.
- */
-static void ScalePlaneSimple(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr) {
-  uint8* dst = dst_ptr;
-  int dx = (src_width << 16) / dst_width;
-  int y;
-  for (y = 0; y < dst_height; ++y) {
-    const uint8* const src = src_ptr + (y * src_height / dst_height) *
-        src_stride;
-    // TODO(fbarchard): Round X coordinate by setting x=0x8000.
-    int x = 0;
-    int i;
-    for (i = 0; i < dst_width; ++i) {
-      *dst++ = src[x >> 16];
-      x += dx;
-    }
-    dst += dst_stride - dst_width;
-  }
-}
-
-/**
- * Scale plane to/from any dimensions.
- */
-static void ScalePlaneAnySize(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_ptr, uint8* dst_ptr,
-                              FilterModeEnum filtering) {
-  if (!filtering) {
-    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src_ptr, dst_ptr);
-  } else {
-    // fall back to non-optimized version
-    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src_ptr, dst_ptr);
-  }
-}
-
-/**
- * Scale plane down, any size
- *
- * This is an optimized version for scaling down a plane to any size.
- * The current implementation is ~10 times faster compared to the
- * reference implementation for e.g. XGA->LowResPAL
- *
- */
-static void ScalePlaneDown(int src_width, int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint8* src_ptr, uint8* dst_ptr,
-                           FilterModeEnum filtering) {
-  if (!filtering) {
-    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src_ptr, dst_ptr);
-  } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) {
-    // between 1/2x and 1x use bilinear
-    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src_ptr, dst_ptr);
-  } else {
-    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
-                  src_stride, dst_stride, src_ptr, dst_ptr);
-  }
-}
-
-/**
- * Copy plane, no scaling
- *
- * This simply copies the given plane without scaling.
- * The current implementation is ~115 times faster
- * compared to the reference implementation.
- *
- */
-static void CopyPlane(int src_width, int src_height,
-                      int dst_width, int dst_height,
-                      int src_stride, int dst_stride,
-                      const uint8* src_ptr, uint8* dst_ptr) {
-  if (src_stride == src_width && dst_stride == dst_width) {
-    // All contiguous, so can use REALLY fast path.
-    memcpy(dst_ptr, src_ptr, src_width * src_height);
-  } else {
-    // Not all contiguous; must copy scanlines individually
-    const uint8* src = src_ptr;
-    uint8* dst = dst_ptr;
-    int i;
-    for (i = 0; i < src_height; ++i) {
-      memcpy(dst, src, src_width);
-      dst += dst_stride;
-      src += src_stride;
-    }
-  }
-}
-
-static void ScalePlane(const uint8* src, int src_stride,
-                       int src_width, int src_height,
-                       uint8* dst, int dst_stride,
-                       int dst_width, int dst_height,
-                       FilterModeEnum filtering, int use_ref) {
-  // Use specialized scales to improve performance for common resolutions.
-  // For example, all the 1/2 scalings will use ScalePlaneDown2()
-  if (dst_width == src_width && dst_height == src_height) {
-    // Straight copy.
-    CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
-              dst_stride, src, dst);
-  } else if (dst_width <= src_width && dst_height <= src_height) {
-    // Scale down.
-    if (use_ref) {
-      // For testing, allow the optimized versions to be disabled.
-      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src, dst, filtering);
-    } else if (4 * dst_width == 3 * src_width &&
-               4 * dst_height == 3 * src_height) {
-      // optimized, 3/4
-      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
-    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
-      // optimized, 1/2
-      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
-    // 3/8 rounded up for odd sized chroma height.
-    } else if (8 * dst_width == 3 * src_width &&
-               dst_height == ((src_height * 3 + 7) / 8)) {
-      // optimized, 3/8
-      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
-    } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
-      // optimized, 1/4
-      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
-    } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
-      // optimized, 1/8
-      ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
-    } else {
-      // Arbitrary downsample
-      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src, dst, filtering);
-    }
-  } else {
-    // Arbitrary scale up and/or down.
-    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
-  }
-}
-
-/**
- * Scale a plane.
- *
- * This function in turn calls a scaling function
- * suitable for handling the desired resolutions.
- *
- */
-
-int I420Scale(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              int src_width, int src_height,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int dst_width, int dst_height,
-              FilterModeEnum filtering) {
-  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (src_height < 0) {
-    int halfheight;
-    src_height = -src_height;
-    halfheight = (src_height + 1) >> 1;
-    src_y = src_y + (src_height - 1) * src_stride_y;
-    src_u = src_u + (halfheight - 1) * src_stride_u;
-    src_v = src_v + (halfheight - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  {
-  int src_halfwidth = (src_width + 1) >> 1;
-  int src_halfheight = (src_height + 1) >> 1;
-  int dst_halfwidth = (dst_width + 1) >> 1;
-  int dst_halfheight = (dst_height + 1) >> 1;
-
-    ScalePlane(src_y, src_stride_y, src_width, src_height,
-               dst_y, dst_stride_y, dst_width, dst_height,
-               filtering, use_reference_impl_);
-  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
-             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-             filtering, use_reference_impl_);
-  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
-             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-             filtering, use_reference_impl_);
-  }
-  return 0;
-}
-
-// Deprecated api
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
-          int src_stride_y, int src_stride_u, int src_stride_v,
-          int src_width, int src_height,
-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
-          int dst_width, int dst_height,
-          int interpolate) {
-  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (src_height < 0) {
-    int halfheight;
-    src_height = -src_height;
-    halfheight = (src_height + 1) >> 1;
-    src_y = src_y + (src_height - 1) * src_stride_y;
-    src_u = src_u + (halfheight - 1) * src_stride_u;
-    src_v = src_v + (halfheight - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  {
-  int src_halfwidth = (src_width + 1) >> 1;
-  int src_halfheight = (src_height + 1) >> 1;
-  int dst_halfwidth = (dst_width + 1) >> 1;
-  int dst_halfheight = (dst_height + 1) >> 1;
-  FilterModeEnum filtering = interpolate ? kFilterBox : kFilterNone;
-
-  ScalePlane(src_y, src_stride_y, src_width, src_height,
-             dst_y, dst_stride_y, dst_width, dst_height,
-             filtering, use_reference_impl_);
-  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
-             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-             filtering, use_reference_impl_);
-  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
-             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-             filtering, use_reference_impl_);
-  }
-  return 0;
-}
-
-// Deprecated api
-int ScaleOffset(const uint8* src, int src_width, int src_height,
-                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
-          int interpolate) {
-  if (!src || src_width <= 0 || src_height <= 0 ||
-      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
-      dst_yoffset >= dst_height) {
-    return -1;
-  }
-  dst_yoffset = dst_yoffset & ~1;  // chroma requires offset to multiple of 2.
-  {
-  int src_halfwidth = (src_width + 1) >> 1;
-  int src_halfheight = (src_height + 1) >> 1;
-  int dst_halfwidth = (dst_width + 1) >> 1;
-  int dst_halfheight = (dst_height + 1) >> 1;
-  int aheight = dst_height - dst_yoffset * 2;  // actual output height
-  const uint8* const src_y = src;
-  const uint8* const src_u = src + src_width * src_height;
-  const uint8* const src_v = src + src_width * src_height +
-                             src_halfwidth * src_halfheight;
-  uint8* dst_y = dst + dst_yoffset * dst_width;
-  uint8* dst_u = dst + dst_width * dst_height +
-                 (dst_yoffset >> 1) * dst_halfwidth;
-  uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
-                 (dst_yoffset >> 1) * dst_halfwidth;
-  return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
-               src_width, src_height, dst_y, dst_u, dst_v, dst_width,
-               dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
-  }
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
--- /dev/null
+++ b/third_party/libyuv/source/scale.cc
@@ -1,0 +1,1716 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "third_party/libyuv/include/libyuv/planar_functions.h"  // CopyPlane
+#include "third_party/libyuv/include/libyuv/row.h"
+#include "third_party/libyuv/include/libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Remove this macro if OVERREAD is safe.
+#define AVOID_OVERREAD 1
+
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+
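As a quick illustration of what the macro above computes (a standalone sketch, not part of the patch): SUBSAMPLE halves a dimension while rounding the magnitude up, the same `(v + 1) >> 1` chroma rounding that the old scale.c did inline, with the sign handled separately for inverted planes.

#include <stdio.h>

#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)

int main(void) {
  /* Halving with round-up; sign treated explicitly for inverted planes. */
  printf("%d\n", SUBSAMPLE(5, 1, 1));   /* 3  : odd width keeps its last column */
  printf("%d\n", SUBSAMPLE(-5, 1, 1));  /* -3 : magnitude rounded the same way  */
  return 0;
}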
+// Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
+// its original size.
+
+static void ScalePlaneDown2(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) =
+    filtering == kFilterNone ? ScaleRowDown2_C :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_C :
+        ScaleRowDown2Box_C);
+  int row_stride = src_stride << 1;
+  if (!filtering) {
+    src_ptr += src_stride;  // Point to odd rows.
+    src_stride = 0;
+  }
+
+#if defined(HAS_SCALEROWDOWN2_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+    ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON;
+  }
+#elif defined(HAS_SCALEROWDOWN2_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 :
+        ScaleRowDown2Box_Unaligned_SSE2);
+    if (IS_ALIGNED(src_ptr, 16) &&
+        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
+        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 :
+          ScaleRowDown2Box_SSE2);
+    }
+  }
+#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
+      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown2 = filtering ?
+        ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  // TODO(fbarchard): Loop through source height to allow odd height.
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
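For intuition, a minimal standalone sketch of the unfiltered 1/2 path that ScalePlaneDown2 sets up above: it points at the odd rows (`src_ptr += src_stride`) and lets the row kernel take every second pixel. The function name below is invented for the sketch, and the odd-column choice mirrors what the C row kernel does, so treat it as illustrative rather than a drop-in replacement.

#include <stdint.h>

/* Illustrative point-sampling 1/2 downscale: every second pixel of every
 * second source row.  Assumes even source dimensions; in libyuv the real
 * per-row work is done by ScaleRowDown2_C and its SIMD variants. */
void HalvePlanePointSample(const uint8_t* src, int src_stride,
                           uint8_t* dst, int dst_stride,
                           int dst_width, int dst_height) {
  int x, y;
  for (y = 0; y < dst_height; ++y) {
    const uint8_t* s = src + (2 * y + 1) * src_stride;  /* odd source rows */
    for (x = 0; x < dst_width; ++x) {
      dst[x] = s[2 * x + 1];                            /* odd source columns */
    }
    dst += dst_stride;
  }
}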
+static void ScalePlaneDown2_16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst_ptr, int dst_width) =
+    filtering == kFilterNone ? ScaleRowDown2_16_C :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
+        ScaleRowDown2Box_16_C);
+  int row_stride = src_stride << 1;
+  if (!filtering) {
+    src_ptr += src_stride;  // Point to odd rows.
+    src_stride = 0;
+  }
+
+#if defined(HAS_SCALEROWDOWN2_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
+    ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
+        ScaleRowDown2_16_NEON;
+  }
+#elif defined(HAS_SCALEROWDOWN2_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+    ScaleRowDown2 = filtering == kFilterNone ?
+        ScaleRowDown2_Unaligned_16_SSE2 :
+        (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_16_SSE2 :
+        ScaleRowDown2Box_Unaligned_16_SSE2);
+    if (IS_ALIGNED(src_ptr, 16) &&
+        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) &&
+        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
+          (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
+          ScaleRowDown2Box_16_SSE2);
+    }
+  }
+#elif defined(HAS_SCALEROWDOWN2_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) &&
+      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown2 = filtering ?
+        ScaleRowDown2Box_16_MIPS_DSPR2 : ScaleRowDown2_16_MIPS_DSPR2;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  // TODO(fbarchard): Loop through source height to allow odd height.
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+// Scale plane, 1/4
+// This is an optimized version for scaling down a plane to 1/4 of
+// its original size.
+
+static void ScalePlaneDown4(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
+  int row_stride = src_stride << 2;
+  if (!filtering) {
+    src_ptr += src_stride * 2;  // Point to row 2.
+    src_stride = 0;
+  }
+#if defined(HAS_SCALEROWDOWN4_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
+  }
+#elif defined(HAS_SCALEROWDOWN4_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2;
+  }
+#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void ScalePlaneDown4_16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst_ptr, int dst_width) =
+      filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
+  int row_stride = src_stride << 2;
+  if (!filtering) {
+    src_ptr += src_stride * 2;  // Point to row 2.
+    src_stride = 0;
+  }
+#if defined(HAS_SCALEROWDOWN4_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
+        ScaleRowDown4_16_NEON;
+  }
+#elif defined(HAS_SCALEROWDOWN4_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
+        ScaleRowDown4_16_SSE2;
+  }
+#elif defined(HAS_SCALEROWDOWN4_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    ScaleRowDown4 = filtering ?
+        ScaleRowDown4Box_16_MIPS_DSPR2 : ScaleRowDown4_16_MIPS_DSPR2;
+  }
+#endif
+
+  if (filtering == kFilterLinear) {
+    src_stride = 0;
+  }
+  for (y = 0; y < dst_height; ++y) {
+    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+// Scale plane down, 3/4
+
+static void ScalePlaneDown34(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr,
+                             enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  assert(dst_width % 3 == 0);
+  if (!filtering) {
+    ScaleRowDown34_0 = ScaleRowDown34_C;
+    ScaleRowDown34_1 = ScaleRowDown34_C;
+  } else {
+    ScaleRowDown34_0 = ScaleRowDown34_0_Box_C;
+    ScaleRowDown34_1 = ScaleRowDown34_1_Box_C;
+  }
+#if defined(HAS_SCALEROWDOWN34_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_NEON;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
+                     dst_ptr, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride;
+  }
+
+  // Remainder 1 or 2 rows with last row vertically unfiltered
+  if ((dst_height % 3) == 2) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+  } else if ((dst_height % 3) == 1) {
+    ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+  }
+}
+
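The 3/4 loop above works in cycles of four source rows per three destination rows, with the third output reusing the `_0` kernel through a negative stride. A compact way to read that schedule follows; the two-row kernel signature is a simplification invented for the sketch (the real ScaleRowDown34_0/_1 take a base pointer plus a stride), and only the row pairing is the point being shown.

#include <stdint.h>

typedef void (*Row34Kernel)(uint8_t* dst, const uint8_t* row_a,
                            const uint8_t* row_b, int dst_width);

/* Vertical schedule of one 3/4 cycle, with the row pairs spelled out. */
void Down34Schedule(const uint8_t* src, int src_stride,
                    uint8_t* dst, int dst_stride,
                    int dst_width, int dst_height,
                    Row34Kernel k0, Row34Kernel k1) {
  int y;
  for (y = 0; y + 2 < dst_height; y += 3) {
    k0(dst, src, src + src_stride, dst_width);                      /* r0,r1 */
    dst += dst_stride;
    k1(dst, src + src_stride, src + 2 * src_stride, dst_width);     /* r1,r2 */
    dst += dst_stride;
    k0(dst, src + 3 * src_stride, src + 2 * src_stride, dst_width); /* r3,r2 */
    dst += dst_stride;
    src += 4 * src_stride;                                          /* next cycle */
  }
}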
+static void ScalePlaneDown34_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr,
+                                enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  assert(dst_width % 3 == 0);
+  if (!filtering) {
+    ScaleRowDown34_0 = ScaleRowDown34_16_C;
+    ScaleRowDown34_1 = ScaleRowDown34_16_C;
+  } else {
+    ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_C;
+    ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_C;
+  }
+#if defined(HAS_SCALEROWDOWN34_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_16_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_16_NEON;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_NEON;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_16_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_16_SSSE3;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_SSSE3;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SCALEROWDOWN34_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_16_MIPS_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_16_MIPS_DSPR2;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_MIPS_DSPR2;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
+                     dst_ptr, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride;
+  }
+
+  // Remainder 1 or 2 rows with last row vertically unfiltered
+  if ((dst_height % 3) == 2) {
+    ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
+  } else if ((dst_height % 3) == 1) {
+    ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
+  }
+}
+
+
+// Scale plane, 3/8
+// This is an optimized version for scaling down a plane to 3/8
+// of its original size.
+//
+// Uses a box filter arranged like this:
+// aaabbbcc -> abc

+// aaabbbcc    def
+// aaabbbcc    ghi
+// dddeeeff
+// dddeeeff
+// dddeeeff
+// ggghhhii
+// ggghhhii
+// Boxes are 3x3, 2x3, 3x2 and 2x2
+
+static void ScalePlaneDown38(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr,
+                             enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  assert(dst_width % 3 == 0);
+  if (!filtering) {
+    ScaleRowDown38_3 = ScaleRowDown38_C;
+    ScaleRowDown38_2 = ScaleRowDown38_C;
+  } else {
+    ScaleRowDown38_3 = ScaleRowDown38_3_Box_C;
+    ScaleRowDown38_2 = ScaleRowDown38_2_Box_C;
+  }
+#if defined(HAS_SCALEROWDOWN38_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_NEON;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON;
+    }
+  }
+#elif defined(HAS_SCALEROWDOWN38_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3;
+    }
+  }
+#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride;
+  }
+
+  // Remainder 1 or 2 rows with last row vertically unfiltered
+  if ((dst_height % 3) == 2) {
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+  } else if ((dst_height % 3) == 1) {
+    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+  }
+}
+
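To make the aaabbbcc diagram above concrete: each output pixel is simply the average of one of those boxes, and the loop in ScalePlaneDown38 walks them in row groups of 3, 3 and 2. A toy version of one 3x3 box follows; the real kernels fold the divide into a fixed-point multiply, so results may differ by a rounding step.

#include <stdint.h>

/* Average of one 3x3 box from the diagram above, for intuition only; the
 * optimized ScaleRowDown38_3 / _2 kernels also handle the 2-row and
 * 2-column edge boxes. */
uint8_t Box3x3Average(const uint8_t* src, int src_stride) {
  int sum = 0;
  int x, y;
  for (y = 0; y < 3; ++y) {
    for (x = 0; x < 3; ++x) {
      sum += src[y * src_stride + x];
    }
  }
  return (uint8_t)(sum / 9);
}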
+static void ScalePlaneDown38_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr,
+                                enum FilterMode filtering) {
+  int y;
+  void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst_ptr, int dst_width);
+  const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  assert(dst_width % 3 == 0);
+  if (!filtering) {
+    ScaleRowDown38_3 = ScaleRowDown38_16_C;
+    ScaleRowDown38_2 = ScaleRowDown38_16_C;
+  } else {
+    ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_C;
+    ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_C;
+  }
+#if defined(HAS_SCALEROWDOWN38_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_16_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_16_NEON;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_NEON;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_NEON;
+    }
+  }
+#elif defined(HAS_SCALEROWDOWN38_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
+      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_16_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_16_SSSE3;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_SSSE3;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_SSSE3;
+    }
+  }
+#elif defined(HAS_SCALEROWDOWN38_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
+      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+    if (!filtering) {
+      ScaleRowDown38_3 = ScaleRowDown38_16_MIPS_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_16_MIPS_DSPR2;
+    } else {
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_MIPS_DSPR2;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  for (y = 0; y < dst_height - 2; y += 3) {
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 2;
+    dst_ptr += dst_stride;
+  }
+
+  // Remainder 1 or 2 rows with last row vertically unfiltered
+  if ((dst_height % 3) == 2) {
+    ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
+    src_ptr += src_stride * 3;
+    dst_ptr += dst_stride;
+    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+  } else if ((dst_height % 3) == 1) {
+    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
+  }
+}
+
+static __inline uint32 SumBox(int iboxwidth, int iboxheight,
+                              ptrdiff_t src_stride, const uint8* src_ptr) {
+  uint32 sum = 0u;
+  int y;
+  assert(iboxwidth > 0);
+  assert(iboxheight > 0);
+  for (y = 0; y < iboxheight; ++y) {
+    int x;
+    for (x = 0; x < iboxwidth; ++x) {
+      sum += src_ptr[x];
+    }
+    src_ptr += src_stride;
+  }
+  return sum;
+}
+
+static __inline uint32 SumBox_16(int iboxwidth, int iboxheight,
+                                 ptrdiff_t src_stride, const uint16* src_ptr) {
+  uint32 sum = 0u;
+  int y;
+  assert(iboxwidth > 0);
+  assert(iboxheight > 0);
+  for (y = 0; y < iboxheight; ++y) {
+    int x;
+    for (x = 0; x < iboxwidth; ++x) {
+      sum += src_ptr[x];
+    }
+    src_ptr += src_stride;
+  }
+  return sum;
+}
+
+static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
+                               int x, int dx, ptrdiff_t src_stride,
+                               const uint8* src_ptr, uint8* dst_ptr) {
+  int i;
+  int boxwidth;
+  for (i = 0; i < dst_width; ++i) {
+    int ix = x >> 16;
+    x += dx;
+    boxwidth = (x >> 16) - ix;
+    *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
+        (boxwidth * boxheight);
+  }
+}
+
+static void ScalePlaneBoxRow_16_C(int dst_width, int boxheight,
+                                  int x, int dx, ptrdiff_t src_stride,
+                                  const uint16* src_ptr, uint16* dst_ptr) {
+  int i;
+  int boxwidth;
+  for (i = 0; i < dst_width; ++i) {
+    int ix = x >> 16;
+    x += dx;
+    boxwidth = (x >> 16) - ix;
+    *dst_ptr++ = SumBox_16(boxwidth, boxheight, src_stride, src_ptr + ix) /
+        (boxwidth * boxheight);
+  }
+}
+
+static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
+  uint32 sum = 0u;
+  int x;
+  assert(iboxwidth > 0);
+  for (x = 0; x < iboxwidth; ++x) {
+    sum += src_ptr[x];
+  }
+  return sum;
+}
+
+static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
+  uint32 sum = 0u;
+  int x;
+  assert(iboxwidth > 0);
+  for (x = 0; x < iboxwidth; ++x) {
+    sum += src_ptr[x];
+  }
+  return sum;
+}
+
+static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  int i;
+  int scaletbl[2];
+  int minboxwidth = (dx >> 16);
+  int* scaleptr = scaletbl - minboxwidth;
+  int boxwidth;
+  scaletbl[0] = 65536 / (minboxwidth * boxheight);
+  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
+  for (i = 0; i < dst_width; ++i) {
+    int ix = x >> 16;
+    x += dx;
+    boxwidth = (x >> 16) - ix;
+    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
+  }
+}
+
+static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  int i;
+  int scaletbl[2];
+  int minboxwidth = (dx >> 16);
+  int* scaleptr = scaletbl - minboxwidth;
+  int boxwidth;
+  scaletbl[0] = 65536 / (minboxwidth * boxheight);
+  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
+  for (i = 0; i < dst_width; ++i) {
+    int ix = x >> 16;
+    x += dx;
+    boxwidth = (x >> 16) - ix;
+    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
+        scaleptr[boxwidth] >> 16;
+  }
+}
+
+static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
+                            const uint16* src_ptr, uint8* dst_ptr) {
+  int boxwidth = (dx >> 16);
+  int scaleval = 65536 / (boxwidth * boxheight);
+  int i;
+  for (i = 0; i < dst_width; ++i) {
+    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
+    x += boxwidth;
+  }
+}
+
+static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
+                               const uint32* src_ptr, uint16* dst_ptr) {
+  int boxwidth = (dx >> 16);
+  int scaleval = 65536 / (boxwidth * boxheight);
+  int i;
+  for (i = 0; i < dst_width; ++i) {
+    *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + x) * scaleval >> 16;
+    x += boxwidth;
+  }
+}
+
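ScaleAddCols2_C above avoids a per-pixel divide with a tiny reciprocal table: the box is always minboxwidth or minboxwidth + 1 pixels wide, and `scaleptr = scaletbl - minboxwidth` lets `scaleptr[boxwidth]` land on the matching precomputed 65536 / (boxwidth * boxheight). A small numeric check with arbitrarily chosen values:

#include <stdio.h>

int main(void) {
  /* dx = 2.5 in 16.16 fixed point, so boxes are 2 or 3 pixels wide. */
  int boxheight = 2;
  int dx = 0x28000;
  int minboxwidth = dx >> 16;               /* 2 */
  int scaletbl[2];
  int* scaleptr = scaletbl - minboxwidth;   /* scaleptr[2], scaleptr[3] valid */
  int sum = 600;                            /* a 3x2 box of pixels, all 100 */
  scaletbl[0] = 65536 / (minboxwidth * boxheight);
  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
  printf("%d\n", sum * scaleptr[3] >> 16);  /* prints 99 (~100, truncated) */
  return 0;
}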
+// Scale plane down to any dimensions, with interpolation (boxfilter).
+//
+// Same method as SimpleScale, which uses fixed point (16.16) to step
+// through the source, outputting one destination pixel per step by
+// averaging a box of source pixels.
+static void ScalePlaneBox(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr) {
+  int j;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height << 16);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  // TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
+  if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
+    uint8* dst = dst_ptr;
+    int j;
+    for (j = 0; j < dst_height; ++j) {
+      int boxheight;
+      int iy = y >> 16;
+      const uint8* src = src_ptr + iy * src_stride;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = (y >> 16) - iy;
+      ScalePlaneBoxRow_C(dst_width, boxheight,
+                         x, dx, src_stride,
+                         src, dst);
+      dst += dst_stride;
+    }
+    return;
+  }
+  {
+    // Allocate a row buffer of uint16.
+    align_buffer_64(row16, src_width * 2);
+    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+        const uint16* src_ptr, uint8* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C;
+    void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
+        uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C;
+
+#if defined(HAS_SCALEADDROWS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) &&
+#ifdef AVOID_OVERREAD
+        IS_ALIGNED(src_width, 16) &&
+#endif
+        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+      ScaleAddRows = ScaleAddRows_SSE2;
+    }
+#endif
+
+    for (j = 0; j < dst_height; ++j) {
+      int boxheight;
+      int iy = y >> 16;
+      const uint8* src = src_ptr + iy * src_stride;
+      y += dy;
+      if (y > (src_height << 16)) {
+        y = (src_height << 16);
+      }
+      boxheight = (y >> 16) - iy;
+      ScaleAddRows(src, src_stride, (uint16*)(row16),
+                 src_width, boxheight);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16),
+                 dst_ptr);
+      dst_ptr += dst_stride;
+    }
+    free_aligned_buffer_64(row16);
+  }
+}
+
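The 16.16 stepping drives both loops in ScalePlaneBox: the integer part of y picks the first source row of each box, and the amount the integer part advances per output row is the box height. The toy below uses the plain src/dst ratio for dy, which is what the scale.c being removed by this patch computed; the new code obtains x/y/dx/dy from ScaleSlope() instead.

#include <stdio.h>

int main(void) {
  int src_height = 100, dst_height = 30;
  int dy = (src_height << 16) / dst_height;  /* ~3.33 source rows per step */
  int y = 0, j;
  for (j = 0; j < 4; ++j) {
    int iy = y >> 16;            /* first source row of this box */
    int boxheight;
    y += dy;
    boxheight = (y >> 16) - iy;  /* 3 rows, occasionally 4 */
    printf("dst row %d: src row %d, boxheight %d\n", j, iy, boxheight);
  }
  return 0;
}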
+static void ScalePlaneBox_16(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint16* src_ptr, uint16* dst_ptr) {
+  int j;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height << 16);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+  // TODO(fbarchard): Remove this and make AddRows handle boxheight 1.
+  if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) {
+    uint16* dst = dst_ptr;
+    int j;
+    for (j = 0; j < dst_height; ++j) {
+      int boxheight;
+      int iy = y >> 16;
+      const uint16* src = src_ptr + iy * src_stride;
+      y += dy;
+      if (y > max_y) {
+        y = max_y;
+      }
+      boxheight = (y >> 16) - iy;
+      ScalePlaneBoxRow_16_C(dst_width, boxheight,
+                            x, dx, src_stride,
+                            src, dst);
+      dst += dst_stride;
+    }
+    return;
+  }
+  {
+    // Allocate a row buffer of uint32.
+    align_buffer_64(row32, src_width * 4);
+    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
+        const uint32* src_ptr, uint16* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
+    void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride,
+        uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C;
+
+#if defined(HAS_SCALEADDROWS_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) &&
+#ifdef AVOID_OVERREAD
+        IS_ALIGNED(src_width, 16) &&
+#endif
+        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+      ScaleAddRows = ScaleAddRows_16_SSE2;
+    }
+#endif
+
+    for (j = 0; j < dst_height; ++j) {
+      int boxheight;
+      int iy = y >> 16;
+      const uint16* src = src_ptr + iy * src_stride;
+      y += dy;
+      if (y > (src_height << 16)) {
+        y = (src_height << 16);
+      }
+      boxheight = (y >> 16) - iy;
+      ScaleAddRows(src, src_stride, (uint32*)(row32),
+                 src_width, boxheight);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32),
+                 dst_ptr);
+      dst_ptr += dst_stride;
+    }
+    free_aligned_buffer_64(row32);
+  }
+}
+
+// Scale plane down with bilinear interpolation.
+void ScalePlaneBilinearDown(int src_width, int src_height,
+                            int dst_width, int dst_height,
+                            int src_stride, int dst_stride,
+                            const uint8* src_ptr, uint8* dst_ptr,
+                            enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer.
+  align_buffer_64(row, src_width);
+
+  const int max_y = (src_height - 1) << 16;
+  int j;
+  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(src_width, 4)) {
+      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    }
+  }
+#endif
+
+
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_SSSE3;
+  }
+#endif
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    int yi = y >> 16;
+    const uint8* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow(row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, row, dst_width, x, dx);
+    }
+    dst_ptr += dst_stride;
+    y += dy;
+    if (y > max_y) {
+      y = max_y;
+    }
+  }
+  free_aligned_buffer_64(row);
+}
+
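In the bilinear-down loop above, the vertical blend factor handed to InterpolateRow is just the top 8 bits of the 16.16 fraction, i.e. a 0-255 weight between the two neighbouring source rows. A one-shot check with made-up numbers:

#include <stdio.h>

int main(void) {
  int y = (2 << 16) + 0xC000;   /* source position 2.75 in 16.16 */
  int yi = y >> 16;             /* 2: upper of the two source rows */
  int yf = (y >> 8) & 255;      /* 192: fraction rescaled to 0..255 */
  printf("blend rows %d and %d with fraction %d/256\n", yi, yi + 1, yf);
  return 0;
}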
+void ScalePlaneBilinearDown_16(int src_width, int src_height,
+                               int dst_width, int dst_height,
+                               int src_stride, int dst_stride,
+                               const uint16* src_ptr, uint16* dst_ptr,
+                               enum FilterMode filtering) {
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+  // Allocate a row buffer.
+  align_buffer_64(row, src_width * 2);
+
+  const int max_y = (src_height - 1) << 16;
+  int j;
+  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) =
+      (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
+  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
+      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+        InterpolateRow = InterpolateRow_16_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
+      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
+        InterpolateRow = InterpolateRow_16_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(src_width, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+    if (IS_ALIGNED(src_width, 4)) {
+      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+    }
+  }
+#endif
+
+
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+  }
+#endif
+  if (y > max_y) {
+    y = max_y;
+  }
+
+  for (j = 0; j < dst_height; ++j) {
+    int yi = y >> 16;
+    const uint16* src = src_ptr + yi * src_stride;
+    if (filtering == kFilterLinear) {
+      ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
+    } else {
+      int yf = (y >> 8) & 255;
+      InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+    }
+    dst_ptr += dst_stride;
+    y += dy;
+    if (y > max_y) {
+      y = max_y;
+    }
+  }
+  free_aligned_buffer_64(row);
+}
+
+// Scale up or down with bilinear interpolation.
+void ScalePlaneBilinearUp(int src_width, int src_height,
+                          int dst_width, int dst_height,
+                          int src_stride, int dst_stride,
+                          const uint8* src_ptr, uint8* dst_ptr,
+                          enum FilterMode filtering) {
+  int j;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height - 1) << 16;
+  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
+       int dst_width, int x, int dx) =
+       filtering ? ScaleFilterCols_C : ScaleCols_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
+      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  if (filtering && src_width >= 32768) {
+    ScaleFilterCols = ScaleFilterCols64_C;
+  }
+#if defined(HAS_SCALEFILTERCOLS_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_SSSE3;
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleFilterCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+      ScaleFilterCols = ScaleColsUp2_SSE2;
+    }
+#endif
+  }
+
+  if (y > max_y) {
+    y = max_y;
+  }
+  {
+    int yi = y >> 16;
+    const uint8* src = src_ptr + yi * src_stride;
+
+    // Allocate 2 row buffers.
+    const int kRowSize = (dst_width + 15) & ~15;
+    align_buffer_64(row, kRowSize * 2);
+
+    uint8* rowptr = row;
+    int rowstride = kRowSize;
+    int lasty = yi;
+
+    ScaleFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {
+      src += src_stride;
+    }
+    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
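+    // The two buffers are ping-ponged: when the source row advances, the new
+    // row is column-scaled into the stale buffer and rowstride is negated so
+    // the previous lower row becomes the upper row of the next blend.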
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        if (y > max_y) {
+          y = max_y;
+          yi = y >> 16;
+          src = src_ptr + yi * src_stride;
+        }
+        if (yi != lasty) {
+          ScaleFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+      }
+      dst_ptr += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+void ScalePlaneBilinearUp_16(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint16* src_ptr, uint16* dst_ptr,
+                             enum FilterMode filtering) {
+  int j;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  const int max_y = (src_height - 1) << 16;
+  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
+       int dst_width, int x, int dx) =
+       filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
+      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_16_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
+      if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_16_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) {
+    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width, 4)) {
+      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+    }
+  }
+#endif
+
+  if (filtering && src_width >= 32768) {
+    ScaleFilterCols = ScaleFilterCols64_16_C;
+  }
+#if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
+  if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_16_SSSE3;
+  }
+#endif
+  if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+    ScaleFilterCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+      ScaleFilterCols = ScaleColsUp2_16_SSE2;
+    }
+#endif
+  }
+
+  if (y > max_y) {
+    y = max_y;
+  }
+  {
+    int yi = y >> 16;
+    const uint16* src = src_ptr + yi * src_stride;
+
+    // Allocate 2 row buffers.
+    const int kRowSize = (dst_width + 15) & ~15;
+    align_buffer_64(row, kRowSize * 4);
+
+    uint16* rowptr = (uint16*)row;
+    int rowstride = kRowSize;
+    int lasty = yi;
+
+    ScaleFilterCols(rowptr, src, dst_width, x, dx);
+    if (src_height > 1) {
+      src += src_stride;
+    }
+    ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+    src += src_stride;
+
+    for (j = 0; j < dst_height; ++j) {
+      yi = y >> 16;
+      if (yi != lasty) {
+        if (y > max_y) {
+          y = max_y;
+          yi = y >> 16;
+          src = src_ptr + yi * src_stride;
+        }
+        if (yi != lasty) {
+          ScaleFilterCols(rowptr, src, dst_width, x, dx);
+          rowptr += rowstride;
+          rowstride = -rowstride;
+          lasty = yi;
+          src += src_stride;
+        }
+      }
+      if (filtering == kFilterLinear) {
+        InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0);
+      } else {
+        int yf = (y >> 8) & 255;
+        InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf);
+      }
+      dst_ptr += dst_stride;
+      y += dy;
+    }
+    free_aligned_buffer_64(row);
+  }
+}
+
+// Scale plane to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: the upper 16 bits
+// of x and dx are the integer part of the source position and
+// the lower 16 bits are the fractional part.
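+// Example: scaling a 640-wide source to 480 wide gives
+// dx = FixedDiv(640, 480) = 0x15555 (~1.333); each output pixel advances x
+// by that step and x >> 16 selects the source column.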
+
+static void ScalePlaneSimple(int src_width, int src_height,
+                             int dst_width, int dst_height,
+                             int src_stride, int dst_stride,
+                             const uint8* src_ptr, uint8* dst_ptr) {
+  int i;
+  void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
+      int dst_width, int x, int dx) = ScaleCols_C;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleCols = ScaleColsUp2_C;
+#if defined(HAS_SCALECOLS_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+      ScaleCols = ScaleColsUp2_SSE2;
+    }
+#endif
+  }
+
+  for (i = 0; i < dst_height; ++i) {
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
+              dst_width, x, dx);
+    dst_ptr += dst_stride;
+    y += dy;
+  }
+}
+
+static void ScalePlaneSimple_16(int src_width, int src_height,
+                                int dst_width, int dst_height,
+                                int src_stride, int dst_stride,
+                                const uint16* src_ptr, uint16* dst_ptr) {
+  int i;
+  void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
+      int dst_width, int x, int dx) = ScaleCols_16_C;
+  // Initial source x/y coordinate and step values as 16.16 fixed point.
+  int x = 0;
+  int y = 0;
+  int dx = 0;
+  int dy = 0;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
+             &x, &y, &dx, &dy);
+  src_width = Abs(src_width);
+
+  if (src_width * 2 == dst_width && x < 0x8000) {
+    ScaleCols = ScaleColsUp2_16_C;
+#if defined(HAS_SCALECOLS_16_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+        IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+      ScaleCols = ScaleColsUp2_16_SSE2;
+    }
+#endif
+  }
+
+  for (i = 0; i < dst_height; ++i) {
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
+              dst_width, x, dx);
+    dst_ptr += dst_stride;
+    y += dy;
+  }
+}
+
+// Scale a plane.
+// This function dispatches to a specialized scaler based on scale factor.
+
+LIBYUV_API
+void ScalePlane(const uint8* src, int src_stride,
+                int src_width, int src_height,
+                uint8* dst, int dst_stride,
+                int dst_width, int dst_height,
+                enum FilterMode filtering) {
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height,
+                                filtering);
+
+  // Negative height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+
+  // Use specialized scalers to improve performance for common resolutions.
+  // For example, all 1/2 scalings will use ScalePlaneDown2().
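+  // e.g. 1280x720 -> 640x360 with kFilterBox is first reduced to bilinear by
+  // ScaleFilterReduce and then dispatched to ScalePlaneDown2 below.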
+  if (dst_width == src_width && dst_height == src_height) {
+    // Straight copy.
+    CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
+    return;
+  }
+  if (dst_width == src_width) {
+    int dy = FixedDiv(src_height, dst_height);
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical(src_height,
+                       dst_width, dst_height,
+                       src_stride, dst_stride, src, dst,
+                       0, 0, dy, 1, filtering);
+    return;
+  }
+  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+    // Scale down.
+    if (4 * dst_width == 3 * src_width &&
+        4 * dst_height == 3 * src_height) {
+      // optimized, 3/4
+      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+      // optimized, 1/2
+      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    // 3/8, rounded up for odd-sized chroma height.
+    if (8 * dst_width == 3 * src_width &&
+        dst_height == ((src_height * 3 + 7) / 8)) {
+      // optimized, 3/8
+      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
+                       src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+        filtering != kFilterBilinear) {
+      // optimized, 1/4
+      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+  }
+  if (filtering == kFilterBox && dst_height * 2 < src_height) {
+    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
+                  src_stride, dst_stride, src, dst);
+    return;
+  }
+  if (filtering && dst_height > src_height) {
+    ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  if (filtering) {
+    ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height,
+                           src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
+                   src_stride, dst_stride, src, dst);
+}
+
+LIBYUV_API
+void ScalePlane_16(const uint16* src, int src_stride,
+                  int src_width, int src_height,
+                  uint16* dst, int dst_stride,
+                  int dst_width, int dst_height,
+                  enum FilterMode filtering) {
+  // Simplify filtering when possible.
+  filtering = ScaleFilterReduce(src_width, src_height,
+                                dst_width, dst_height,
+                                filtering);
+
+  // Negative height means invert the image.
+  if (src_height < 0) {
+    src_height = -src_height;
+    src = src + (src_height - 1) * src_stride;
+    src_stride = -src_stride;
+  }
+
+  // Use specialized scalers to improve performance for common resolutions.
+  // For example, all 1/2 scalings will use ScalePlaneDown2_16().
+  if (dst_width == src_width && dst_height == src_height) {
+    // Straight copy.
+    CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
+    return;
+  }
+  if (dst_width == src_width) {
+    int dy = FixedDiv(src_height, dst_height);
+    // Arbitrary scale vertically, but unscaled horizontally.
+    ScalePlaneVertical_16(src_height,
+                          dst_width, dst_height,
+                          src_stride, dst_stride, src, dst,
+                          0, 0, dy, 1, filtering);
+    return;
+  }
+  if (dst_width <= Abs(src_width) && dst_height <= src_height) {
+    // Scale down.
+    if (4 * dst_width == 3 * src_width &&
+        4 * dst_height == 3 * src_height) {
+      // optimized, 3/4
+      ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
+                          src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (2 * dst_width == src_width && 2 * dst_height == src_height) {
+      // optimized, 1/2
+      ScalePlaneDown2_16(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    // 3/8, rounded up for odd-sized chroma height.
+    if (8 * dst_width == 3 * src_width &&
+        dst_height == ((src_height * 3 + 7) / 8)) {
+      // optimized, 3/8
+      ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
+                          src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+        filtering != kFilterBilinear) {
+      // optimized, 1/4
+      ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
+                         src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+  }
+  if (filtering == kFilterBox && dst_height * 2 < src_height) {
+    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
+                     src_stride, dst_stride, src, dst);
+    return;
+  }
+  if (filtering && dst_height > src_height) {
+    ScalePlaneBilinearUp_16(src_width, src_height, dst_width, dst_height,
+                            src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  if (filtering) {
+    ScalePlaneBilinearDown_16(src_width, src_height, dst_width, dst_height,
+                              src_stride, dst_stride, src, dst, filtering);
+    return;
+  }
+  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
+                      src_stride, dst_stride, src, dst);
+}
+
+// Scale an I420 image.
+// This function in turn calls a scaling function for each plane.
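+// The U and V planes are scaled at half the Y dimensions, rounded up via
+// SUBSAMPLE so odd-sized frames are handled.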
+
+LIBYUV_API
+int I420Scale(const uint8* src_y, int src_stride_y,
+              const uint8* src_u, int src_stride_u,
+              const uint8* src_v, int src_stride_v,
+              int src_width, int src_height,
+              uint8* dst_y, int dst_stride_y,
+              uint8* dst_u, int dst_stride_u,
+              uint8* dst_v, int dst_stride_v,
+              int dst_width, int dst_height,
+              enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  ScalePlane(src_y, src_stride_y, src_width, src_height,
+             dst_y, dst_stride_y, dst_width, dst_height,
+             filtering);
+  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
+             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+             filtering);
+  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
+             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+             filtering);
+  return 0;
+}
+
+LIBYUV_API
+int I420Scale_16(const uint16* src_y, int src_stride_y,
+                 const uint16* src_u, int src_stride_u,
+                 const uint16* src_v, int src_stride_v,
+                 int src_width, int src_height,
+                 uint16* dst_y, int dst_stride_y,
+                 uint16* dst_u, int dst_stride_u,
+                 uint16* dst_v, int dst_stride_v,
+                 int dst_width, int dst_height,
+                 enum FilterMode filtering) {
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
+      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+    return -1;
+  }
+
+  ScalePlane_16(src_y, src_stride_y, src_width, src_height,
+                dst_y, dst_stride_y, dst_width, dst_height,
+                filtering);
+  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
+                dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
+                filtering);
+  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
+                dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
+                filtering);
+  return 0;
+}
+
+// Deprecated API.
+LIBYUV_API
+int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
+          int src_stride_y, int src_stride_u, int src_stride_v,
+          int src_width, int src_height,
+          uint8* dst_y, uint8* dst_u, uint8* dst_v,
+          int dst_stride_y, int dst_stride_u, int dst_stride_v,
+          int dst_width, int dst_height,
+          LIBYUV_BOOL interpolate) {
+  return I420Scale(src_y, src_stride_y,
+                   src_u, src_stride_u,
+                   src_v, src_stride_v,
+                   src_width, src_height,
+                   dst_y, dst_stride_y,
+                   dst_u, dst_stride_u,
+                   dst_v, dst_stride_v,
+                   dst_width, dst_height,
+                   interpolate ? kFilterBox : kFilterNone);
+}
+
+// Deprecated API.
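+// Scales a contiguous I420 buffer (Y plane followed by U then V, with
+// stride equal to width) and writes the result at a vertical offset in dst.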
+LIBYUV_API
+int ScaleOffset(const uint8* src, int src_width, int src_height,
+                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
+                LIBYUV_BOOL interpolate) {
+  // Chroma requires offset to multiple of 2.
+  int dst_yoffset_even = dst_yoffset & ~1;
+  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
+  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
+  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
+  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
+  int aheight = dst_height - dst_yoffset_even * 2;  // actual output height
+  const uint8* src_y = src;
+  const uint8* src_u = src + src_width * src_height;
+  const uint8* src_v = src + src_width * src_height +
+                             src_halfwidth * src_halfheight;
+  uint8* dst_y = dst + dst_yoffset_even * dst_width;
+  uint8* dst_u = dst + dst_width * dst_height +
+                 (dst_yoffset_even >> 1) * dst_halfwidth;
+  uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
+                 (dst_yoffset_even >> 1) * dst_halfwidth;
+  if (!src || src_width <= 0 || src_height <= 0 ||
+      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
+      dst_yoffset_even >= dst_height) {
+    return -1;
+  }
+  return I420Scale(src_y, src_width,
+                   src_u, src_halfwidth,
+                   src_v, src_halfwidth,
+                   src_width, src_height,
+                   dst_y, dst_width,
+                   dst_u, dst_halfwidth,
+                   dst_v, dst_halfwidth,
+                   dst_width, aheight,
+                   interpolate ? kFilterBox : kFilterNone);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/scale_common.cc
@@ -1,0 +1,1165 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "third_party/libyuv/include/libyuv/planar_functions.h"  // CopyARGB
+#include "third_party/libyuv/include/libyuv/row.h"
+#include "third_party/libyuv/include/libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+  return v >= 0 ? v : -v;
+}
+
+// CPU agnostic row functions
+void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[1];
+    dst[1] = src_ptr[3];
+    dst += 2;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[1];
+  }
+}
+
+void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[1];
+    dst[1] = src_ptr[3];
+    dst += 2;
+    src_ptr += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[1];
+  }
+}
+
+void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  const uint8* s = src_ptr;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+    dst[1] = (s[2] + s[3] + 1) >> 1;
+    dst += 2;
+    s += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+  }
+}
+
+void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                              uint16* dst, int dst_width) {
+  const uint16* s = src_ptr;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+    dst[1] = (s[2] + s[3] + 1) >> 1;
+    dst += 2;
+    s += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + 1) >> 1;
+  }
+}
+
+void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+    dst += 2;
+    s += 4;
+    t += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+  }
+}
+
+void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width) {
+  const uint16* s = src_ptr;
+  const uint16* t = src_ptr + src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
+    dst += 2;
+    s += 4;
+    t += 4;
+  }
+  if (dst_width & 1) {
+    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
+  }
+}
+
+void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                     uint8* dst, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[2];
+    dst[1] = src_ptr[6];
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[2];
+  }
+}
+
+void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                        uint16* dst, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src_ptr[2];
+    dst[1] = src_ptr[6];
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    dst[0] = src_ptr[2];
+  }
+}
+
+void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  intptr_t stride = src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+             src_ptr[stride + 0] + src_ptr[stride + 1] +
+             src_ptr[stride + 2] + src_ptr[stride + 3] +
+             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+             8) >> 4;
+    dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+             src_ptr[stride + 4] + src_ptr[stride + 5] +
+             src_ptr[stride + 6] + src_ptr[stride + 7] +
+             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+             8) >> 4;
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+             src_ptr[stride + 0] + src_ptr[stride + 1] +
+             src_ptr[stride + 2] + src_ptr[stride + 3] +
+             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+             8) >> 4;
+  }
+}
+
+void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                           uint16* dst, int dst_width) {
+  intptr_t stride = src_stride;
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+             src_ptr[stride + 0] + src_ptr[stride + 1] +
+             src_ptr[stride + 2] + src_ptr[stride + 3] +
+             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+             8) >> 4;
+    dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
+             src_ptr[stride + 4] + src_ptr[stride + 5] +
+             src_ptr[stride + 6] + src_ptr[stride + 7] +
+             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
+             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
+             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
+             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
+             8) >> 4;
+    dst += 2;
+    src_ptr += 8;
+  }
+  if (dst_width & 1) {
+    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
+             src_ptr[stride + 0] + src_ptr[stride + 1] +
+             src_ptr[stride + 2] + src_ptr[stride + 3] +
+             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
+             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
+             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
+             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
+             8) >> 4;
+  }
+}
+
+void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width) {
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[1];
+    dst[2] = src_ptr[3];
+    dst += 3;
+    src_ptr += 4;
+  }
+}
+
+void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width) {
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[1];
+    dst[2] = src_ptr[3];
+    dst += 3;
+    src_ptr += 4;
+  }
+}
+
+// Filter rows 0 and 1 together, 3 : 1
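+// Each output is (3 * row0 + row1 + 2) >> 2, where row0/row1 are the
+// 4-to-3 horizontally filtered values a0..a2 and b0..b2 below.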
+void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width) {
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 * 3 + b0 + 2) >> 2;
+    d[1] = (a1 * 3 + b1 + 2) >> 2;
+    d[2] = (a2 * 3 + b2 + 2) >> 2;
+    d += 3;
+    s += 4;
+    t += 4;
+  }
+}
+
+void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width) {
+  const uint16* s = src_ptr;
+  const uint16* t = src_ptr + src_stride;
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 * 3 + b0 + 2) >> 2;
+    d[1] = (a1 * 3 + b1 + 2) >> 2;
+    d[2] = (a2 * 3 + b2 + 2) >> 2;
+    d += 3;
+    s += 4;
+    t += 4;
+  }
+}
+
+// Filter rows 1 and 2 together, 1 : 1
+void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* d, int dst_width) {
+  const uint8* s = src_ptr;
+  const uint8* t = src_ptr + src_stride;
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 + b0 + 1) >> 1;
+    d[1] = (a1 + b1 + 1) >> 1;
+    d[2] = (a2 + b2 + 1) >> 1;
+    d += 3;
+    s += 4;
+    t += 4;
+  }
+}
+
+void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* d, int dst_width) {
+  const uint16* s = src_ptr;
+  const uint16* t = src_ptr + src_stride;
+  int x;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (x = 0; x < dst_width; x += 3) {
+    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    d[0] = (a0 + b0 + 1) >> 1;
+    d[1] = (a1 + b1 + 1) >> 1;
+    d[2] = (a2 + b2 + 1) >> 1;
+    d += 3;
+    s += 4;
+    t += 4;
+  }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                 int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[0] = src_ptr[x >> 16];
+    x += dx;
+    dst_ptr[1] = src_ptr[x >> 16];
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    dst_ptr[0] = src_ptr[x >> 16];
+  }
+}
+
+void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                    int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[0] = src_ptr[x >> 16];
+    x += dx;
+    dst_ptr[1] = src_ptr[x >> 16];
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    dst_ptr[0] = src_ptr[x >> 16];
+  }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
+                    int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+    src_ptr += 1;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    dst_ptr[0] = src_ptr[0];
+  }
+}
+
+void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst_ptr[1] = dst_ptr[0] = src_ptr[0];
+    src_ptr += 1;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    dst_ptr[0] = src_ptr[0];
+  }
+}
+
+// (1-f)a + fb can be replaced with a + f(b-a)
+#define BLENDER(a, b, f) (uint8)((int)(a) + \
+    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
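+// e.g. BLENDER(10, 20, 0x8000) = 10 + ((0x8000 * 10) >> 16) = 15,
+// i.e. a 50% blend.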
+
+void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    xi = x >> 16;
+    a = src_ptr[xi];
+    b = src_ptr[xi + 1];
+    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+  }
+}
+
+void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
+                         int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int64 xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    xi = x >> 16;
+    a = src_ptr[xi];
+    b = src_ptr[xi + 1];
+    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    int64 xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+  }
+}
+#undef BLENDER
+
+#define BLENDER(a, b, f) (uint16)((int)(a) + \
+    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+
+void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                       int dst_width, int x, int dx) {
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    xi = x >> 16;
+    a = src_ptr[xi];
+    b = src_ptr[xi + 1];
+    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    int xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+  }
+}
+
+void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
+                         int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int64 xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    xi = x >> 16;
+    a = src_ptr[xi];
+    b = src_ptr[xi + 1];
+    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
+    x += dx;
+    dst_ptr += 2;
+  }
+  if (dst_width & 1) {
+    int64 xi = x >> 16;
+    int a = src_ptr[xi];
+    int b = src_ptr[xi + 1];
+    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
+  }
+}
+#undef BLENDER
+
+void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                      uint8* dst, int dst_width) {
+  int x;
+  assert(dst_width % 3 == 0);
+  for (x = 0; x < dst_width; x += 3) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[3];
+    dst[2] = src_ptr[6];
+    dst += 3;
+    src_ptr += 8;
+  }
+}
+
+void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                         uint16* dst, int dst_width) {
+  int x;
+  assert(dst_width % 3 == 0);
+  for (x = 0; x < dst_width; x += 3) {
+    dst[0] = src_ptr[0];
+    dst[1] = src_ptr[3];
+    dst[2] = src_ptr[6];
+    dst += 3;
+    src_ptr += 8;
+  }
+}
+
+// 8x3 -> 3x1
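+// The (65536 / 9) and (65536 / 6) multiplies followed by >> 16 approximate
+// dividing the 9- and 6-pixel sums by their counts without a divide.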
+void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[stride + 0] + src_ptr[stride + 1] +
+        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+        (65536 / 9) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[stride + 3] + src_ptr[stride + 4] +
+        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+        (65536 / 9) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[stride + 6] + src_ptr[stride + 7] +
+        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+        (65536 / 6) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
+void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[stride + 0] + src_ptr[stride + 1] +
+        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+        (65536 / 9) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[stride + 3] + src_ptr[stride + 4] +
+        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+        (65536 / 9) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[stride + 6] + src_ptr[stride + 7] +
+        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+        (65536 / 6) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
+// 8x2 -> 3x1
+void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[stride + 0] + src_ptr[stride + 1] +
+        src_ptr[stride + 2]) * (65536 / 6) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[stride + 3] + src_ptr[stride + 4] +
+        src_ptr[stride + 5]) * (65536 / 6) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[stride + 6] + src_ptr[stride + 7]) *
+        (65536 / 4) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
+void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                               uint16* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  int i;
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  for (i = 0; i < dst_width; i += 3) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
+        src_ptr[stride + 0] + src_ptr[stride + 1] +
+        src_ptr[stride + 2]) * (65536 / 6) >> 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
+        src_ptr[stride + 3] + src_ptr[stride + 4] +
+        src_ptr[stride + 5]) * (65536 / 6) >> 16;
+    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
+        src_ptr[stride + 6] + src_ptr[stride + 7]) *
+        (65536 / 4) >> 16;
+    src_ptr += 8;
+    dst_ptr += 3;
+  }
+}
+
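+// Sums each column of src_height pixels; used by the box scaler to
+// accumulate rows before the column averaging step.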
+void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
+                    uint16* dst_ptr, int src_width, int src_height) {
+  int x;
+  assert(src_width > 0);
+  assert(src_height > 0);
+  for (x = 0; x < src_width; ++x) {
+    const uint8* s = src_ptr + x;
+    unsigned int sum = 0u;
+    int y;
+    for (y = 0; y < src_height; ++y) {
+      sum += s[0];
+      s += src_stride;
+    }
+    // TODO(fbarchard): Consider limiting height to 256 to avoid overflow.
+    dst_ptr[x] = sum < 65535u ? sum : 65535u;
+  }
+}
+
+void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
+                       uint32* dst_ptr, int src_width, int src_height) {
+  int x;
+  assert(src_width > 0);
+  assert(src_height > 0);
+  for (x = 0; x < src_width; ++x) {
+    const uint16* s = src_ptr + x;
+    unsigned int sum = 0u;
+    int y;
+    for (y = 0; y < src_height; ++y) {
+      sum += s[0];
+      s += src_stride;
+    }
+    // No risk of overflow here now
+    dst_ptr[x] = sum;
+  }
+}
+
+void ScaleARGBRowDown2_C(const uint8* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8* dst_argb, int dst_width) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src[1];
+    dst[1] = src[3];
+    src += 4;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[1];
+  }
+}
+
+void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
+    dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
+    dst_argb[2] = (src_argb[2] + src_argb[6] + 1) >> 1;
+    dst_argb[3] = (src_argb[3] + src_argb[7] + 1) >> 1;
+    src_argb += 8;
+    dst_argb += 4;
+  }
+}
+
+void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst_argb[0] = (src_argb[0] + src_argb[4] +
+                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] +
+                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] +
+                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] +
+                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    src_argb += 8;
+    dst_argb += 4;
+  }
+}
+
+void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                            int src_stepx,
+                            uint8* dst_argb, int dst_width) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src[0];
+    dst[1] = src[src_stepx];
+    src += src_stepx * 2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[0];
+  }
+}
+
+void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  int x;
+  for (x = 0; x < dst_width; ++x) {
+    dst_argb[0] = (src_argb[0] + src_argb[4] +
+                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] +
+                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] +
+                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] +
+                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    src_argb += src_stepx * 4;
+    dst_argb += 4;
+  }
+}
+
+// Scales a single row of pixels using point sampling.
+void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
+                     int dst_width, int x, int dx) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst[0] = src[x >> 16];
+    x += dx;
+    dst[1] = src[x >> 16];
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[x >> 16];
+  }
+}
+
+void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
+                       int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst[0] = src[x >> 16];
+    x += dx;
+    dst[1] = src[x >> 16];
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[x >> 16];
+  }
+}
+
+// Scales a single row of pixels up by 2x using point sampling.
+void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    dst[1] = dst[0] = src[0];
+    src += 1;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[0];
+  }
+}
+
+// Mimics SSSE3 blender
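+// Blends each channel with a 7-bit fraction f = (x >> 9) & 0x7f:
+// (a * (127 - f) + b * f) >> 7, matching the SSSE3 version's precision.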
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
+#define BLENDERC(a, b, f, s) (uint32)( \
+    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f) \
+    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
+    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+
+void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32 a = src[xi];
+    uint32 b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+    x += dx;
+    xi = x >> 16;
+    xf = (x >> 9) & 0x7f;
+    a = src[xi];
+    b = src[xi + 1];
+    dst[1] = BLENDER(a, b, xf);
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    int xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32 a = src[xi];
+    uint32 b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+  }
+}
+
+void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
+                             int dst_width, int x32, int dx) {
+  int64 x = (int64)(x32);
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  int j;
+  for (j = 0; j < dst_width - 1; j += 2) {
+    int64 xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32 a = src[xi];
+    uint32 b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+    x += dx;
+    xi = x >> 16;
+    xf = (x >> 9) & 0x7f;
+    a = src[xi];
+    b = src[xi + 1];
+    dst[1] = BLENDER(a, b, xf);
+    x += dx;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    int64 xi = x >> 16;
+    int xf = (x >> 9) & 0x7f;
+    uint32 a = src[xi];
+    uint32 b = src[xi + 1];
+    dst[0] = BLENDER(a, b, xf);
+  }
+}
+#undef BLENDER1
+#undef BLENDERC
+#undef BLENDER
+
+// Scale plane vertically with bilinear interpolation.
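+// bpp is bytes per pixel: 1 for a Y/U/V plane, 4 for ARGB, so each output
+// row is dst_width * bpp bytes wide.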
+void ScalePlaneVertical(int src_height,
+                        int dst_width, int dst_height,
+                        int src_stride, int dst_stride,
+                        const uint8* src_argb, uint8* dst_argb,
+                        int x, int y, int dy,
+                        int bpp, enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher bpp.
+  int dst_width_bytes = dst_width * bpp;
+  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_C;
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(bpp >= 1 && bpp <= 4);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * bpp;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && dst_width_bytes >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSE2;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSE2;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && dst_width_bytes >= 16) {
+    InterpolateRow = InterpolateRow_Any_SSSE3;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && dst_width_bytes >= 32) {
+    InterpolateRow = InterpolateRow_Any_AVX2;
+    if (IS_ALIGNED(dst_width_bytes, 32)) {
+      InterpolateRow = InterpolateRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && dst_width_bytes >= 16) {
+    InterpolateRow = InterpolateRow_Any_NEON;
+    if (IS_ALIGNED(dst_width_bytes, 16)) {
+      InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_bytes >= 4 &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width_bytes, 4)) {
+      InterpolateRow = InterpolateRow_MIPS_DSPR2;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi;
+    int yf;
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_bytes, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+void ScalePlaneVertical_16(int src_height,
+                           int dst_width, int dst_height,
+                           int src_stride, int dst_stride,
+                           const uint16* src_argb, uint16* dst_argb,
+                           int x, int y, int dy,
+                           int wpp, enum FilterMode filtering) {
+  // TODO(fbarchard): Allow higher wpp.
+  int dst_width_words = dst_width * wpp;
+  void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
+      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+      InterpolateRow_16_C;
+  const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
+  int j;
+  assert(wpp >= 1 && wpp <= 2);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  src_argb += (x >> 16) * wpp;
+#if defined(HAS_INTERPOLATEROW_16_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && dst_width_words >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_SSE2;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_16_SSE2;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_16_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && dst_width_words >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_SSSE3;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_Unaligned_16_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+        InterpolateRow = InterpolateRow_16_SSSE3;
+      }
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && dst_width_words >= 32) {
+    InterpolateRow = InterpolateRow_Any_16_AVX2;
+    if (IS_ALIGNED(dst_width_words, 32)) {
+      InterpolateRow = InterpolateRow_16_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_16_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && dst_width_words >= 16) {
+    InterpolateRow = InterpolateRow_Any_16_NEON;
+    if (IS_ALIGNED(dst_width_words, 16)) {
+      InterpolateRow = InterpolateRow_16_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROWS_16_MIPS_DSPR2)
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width_words >= 4 &&
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
+      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+    InterpolateRow = InterpolateRow_Any_16_MIPS_DSPR2;
+    if (IS_ALIGNED(dst_width_words, 4)) {
+      InterpolateRow = InterpolateRow_16_MIPS_DSPR2;
+    }
+  }
+#endif
+  for (j = 0; j < dst_height; ++j) {
+    int yi;
+    int yf;
+    if (y > max_y) {
+      y = max_y;
+    }
+    yi = y >> 16;
+    yf = filtering ? ((y >> 8) & 255) : 0;
+    InterpolateRow(dst_argb, src_argb + yi * src_stride,
+                   src_stride, dst_width_words, yf);
+    dst_argb += dst_stride;
+    y += dy;
+  }
+}
+
+// Simplify the filtering based on scale factors.
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering) {
+  if (src_width < 0) {
+    src_width = -src_width;
+  }
+  if (src_height < 0) {
+    src_height = -src_height;
+  }
+  if (filtering == kFilterBox) {
+    // If scaling both axis to 0.5 or larger, switch from Box to Bilinear.
+    if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) {
+      filtering = kFilterBilinear;
+    }
+    // If scaling to larger, switch from Box to Bilinear.
+    if (dst_width >= src_width || dst_height >= src_height) {
+      filtering = kFilterBilinear;
+    }
+  }
+  if (filtering == kFilterBilinear) {
+    if (src_height == 1) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Detect any odd scale factor and reduce to Linear.
+    if (dst_height == src_height || dst_height * 3 == src_height) {
+      filtering = kFilterLinear;
+    }
+    // TODO(fbarchard): Remove 1 pixel wide filter restriction, which is to
+    // avoid reading 2 pixels horizontally that causes memory exception.
+    if (src_width == 1) {
+      filtering = kFilterNone;
+    }
+  }
+  if (filtering == kFilterLinear) {
+    if (src_width == 1) {
+      filtering = kFilterNone;
+    }
+    // TODO(fbarchard): Detect any odd scale factor and reduce to None.
+    if (dst_width == src_width || dst_width * 3 == src_width) {
+      filtering = kFilterNone;
+    }
+  }
+  return filtering;
+}
+
+// Divide num by div and return as 16.16 fixed point result.
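+// e.g. FixedDiv(1, 2) = 0x8000 (0.5 in 16.16).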
+int FixedDiv_C(int num, int div) {
+  return (int)(((int64)(num) << 16) / div);
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div) {
+  return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
+}
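For a feel of the 16.16 fixed-point values these helpers produce (worked numbers, not part of the patch):

    // FixedDiv_C(640, 320)  == (640 << 16) / 320 == 0x20000  -> step of 2.0
    // FixedDiv_C(320, 640)  == (320 << 16) / 640 == 0x08000  -> step of 0.5
    // FixedDiv1_C(320, 640) == ((320 << 16) - 0x10001) / 639 == 32716 (0x7FCC)
    //   (just under 0.5, so when upsampling the last output pixel maps to
    //    approximately the last source pixel instead of stepping past it)
    int dx_down = FixedDiv_C(640, 320);   // 0x20000
    int dx_up   = FixedDiv1_C(320, 640);  // 32716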
+
+#define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
+
+// Compute slope values for stepping.
+void ScaleSlope(int src_width, int src_height,
+                int dst_width, int dst_height,
+                enum FilterMode filtering,
+                int* x, int* y, int* dx, int* dy) {
+  assert(x != NULL);
+  assert(y != NULL);
+  assert(dx != NULL);
+  assert(dy != NULL);
+  assert(src_width != 0);
+  assert(src_height != 0);
+  assert(dst_width > 0);
+  assert(dst_height > 0);
+  // Check for 1 pixel and avoid FixedDiv overflow.
+  if (dst_width == 1 && src_width >= 32768) {
+    dst_width = src_width;
+  }
+  if (dst_height == 1 && src_height >= 32768) {
+    dst_height = src_height;
+  }
+  if (filtering == kFilterBox) {
+    // Scale step for point sampling duplicates all pixels equally.
+    *dx = FixedDiv(Abs(src_width), dst_width);
+    *dy = FixedDiv(src_height, dst_height);
+    *x = 0;
+    *y = 0;
+  } else if (filtering == kFilterBilinear) {
+    // Scale step for bilinear sampling renders last pixel once for upsample.
+    if (dst_width <= Abs(src_width)) {
+      *dx = FixedDiv(Abs(src_width), dst_width);
+      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
+    } else if (dst_width > 1) {
+      *dx = FixedDiv1(Abs(src_width), dst_width);
+      *x = 0;
+    }
+    if (dst_height <= src_height) {
+      *dy = FixedDiv(src_height, dst_height);
+      *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
+    } else if (dst_height > 1) {
+      *dy = FixedDiv1(src_height, dst_height);
+      *y = 0;
+    }
+  } else if (filtering == kFilterLinear) {
+    // Scale step for bilinear sampling renders last pixel once for upsample.
+    if (dst_width <= Abs(src_width)) {
+      *dx = FixedDiv(Abs(src_width), dst_width);
+      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
+    } else if (dst_width > 1) {
+      *dx = FixedDiv1(Abs(src_width), dst_width);
+      *x = 0;
+    }
+    *dy = FixedDiv(src_height, dst_height);
+    *y = *dy >> 1;
+  } else {
+    // Scale step for point sampling duplicates all pixels equally.
+    *dx = FixedDiv(Abs(src_width), dst_width);
+    *dy = FixedDiv(src_height, dst_height);
+    *x = CENTERSTART(*dx, 0);
+    *y = CENTERSTART(*dy, 0);
+  }
+  // Negative src_width means horizontally mirror.
+  if (src_width < 0) {
+    *x += (dst_width - 1) * *dx;
+    *dx = -*dx;
+    // src_width = -src_width;   // Caller must do this.
+  }
+}
+#undef CENTERSTART
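Concretely, a bilinear 640x480 -> 320x240 downscale produces these slopes (illustrative call):

    int x, y, dx, dy;
    ScaleSlope(640, 480, 320, 240, kFilterBilinear, &x, &y, &dx, &dy);
    // dx == dy == 0x20000  (advance 2.0 source pixels per output pixel)
    // x  == y  == 0x08000  (start 0.5 source pixels in: the CENTERSTART term
    //                       centres the 2-tap filter on each output pixel)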
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/scale_mips.cc
@@ -1,0 +1,653 @@
+/*
+ *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/basic_types.h"
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC MIPS DSPR2
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width) {
+  __asm__ __volatile__(
+    ".set push                                     \n"
+    ".set noreorder                                \n"
+
+    "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
+    "beqz           $t9, 2f                        \n"
+    " nop                                          \n"
+
+    ".p2align       2                              \n"
+  "1:                                              \n"
+    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
+    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
+    "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
+    "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
+    "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
+    "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
+    "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
+    "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
+    // TODO(fbarchard): Use odd pixels instead of even.
+    "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
+    "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
+    "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
+    "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
+    "addiu          %[src_ptr], %[src_ptr], 32     \n"
+    "addiu          $t9, $t9, -1                   \n"
+    "sw             $t8, 0(%[dst])                 \n"
+    "sw             $t0, 4(%[dst])                 \n"
+    "sw             $t1, 8(%[dst])                 \n"
+    "sw             $t2, 12(%[dst])                \n"
+    "bgtz           $t9, 1b                        \n"
+    " addiu         %[dst], %[dst], 16             \n"
+
+  "2:                                              \n"
+    "andi           $t9, %[dst_width], 0xf         \n"  // residue
+    "beqz           $t9, 3f                        \n"
+    " nop                                          \n"
+
+  "21:                                             \n"
+    "lbu            $t0, 0(%[src_ptr])             \n"
+    "addiu          %[src_ptr], %[src_ptr], 2      \n"
+    "addiu          $t9, $t9, -1                   \n"
+    "sb             $t0, 0(%[dst])                 \n"
+    "bgtz           $t9, 21b                       \n"
+    " addiu         %[dst], %[dst], 1              \n"
+
+  "3:                                              \n"
+    ".set pop                                      \n"
+  : [src_ptr] "+r" (src_ptr),
+    [dst] "+r" (dst)
+  : [dst_width] "r" (dst_width)
+  : "t0", "t1", "t2", "t3", "t4", "t5",
+    "t6", "t7", "t8", "t9"
+  );
+}
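In scalar terms the row above is a plain even-pixel subsample; a minimal C sketch (hypothetical helper name, using the uint8 typedef from basic_types.h):

    // Keep every other source byte; the DSPR2 code above packs 16 outputs per
    // iteration with precr.qb.ph and handles the remainder one byte at a time.
    static void ScaleRowDown2_sketch(const uint8* src_ptr, uint8* dst,
                                     int dst_width) {
      int x;
      for (x = 0; x < dst_width; ++x) {
        dst[x] = src_ptr[x * 2];  // even pixels (see the TODO above)
      }
    }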
+
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                 uint8* dst, int dst_width) {
+  const uint8* t = src_ptr + src_stride;
+
+  __asm__ __volatile__ (
+    ".set push                                    \n"
+    ".set noreorder                               \n"
+
+    "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
+    "bltz           $t9, 2f                       \n"
+    " nop                                         \n"
+
+    ".p2align       2                             \n"
+  "1:                                             \n"
+    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
+    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
+    "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
+    "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
+    "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
+    "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
+    "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
+    "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
+    "addiu          $t9, $t9, -1                  \n"
+    "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
+    "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
+    "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
+    "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
+    "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
+    "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
+    "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
+    "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
+    "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
+    "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
+    "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
+    "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
+    "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
+    "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
+    "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
+    "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
+    "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
+    "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
+    "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
+    "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
+    "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
+    "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
+    "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
+    "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
+    "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
+    "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
+    "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
+    "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
+    "addiu          %[src_ptr], %[src_ptr], 16    \n"
+    "addiu          %[t], %[t], 16                \n"
+    "sb             $t0, 0(%[dst])                \n"
+    "sb             $t4, 1(%[dst])                \n"
+    "sb             $t1, 2(%[dst])                \n"
+    "sb             $t5, 3(%[dst])                \n"
+    "sb             $t2, 4(%[dst])                \n"
+    "sb             $t6, 5(%[dst])                \n"
+    "sb             $t3, 6(%[dst])                \n"
+    "sb             $t7, 7(%[dst])                \n"
+    "bgtz           $t9, 1b                       \n"
+    " addiu         %[dst], %[dst], 8             \n"
+
+  "2:                                             \n"
+    "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
+    "beqz           $t9, 3f                       \n"
+    " nop                                         \n"
+
+    "21:                                          \n"
+    "lwr            $t1, 0(%[src_ptr])            \n"
+    "lwl            $t1, 3(%[src_ptr])            \n"
+    "lwr            $t2, 0(%[t])                  \n"
+    "lwl            $t2, 3(%[t])                  \n"
+    "srl            $t8, $t1, 16                  \n"
+    "ins            $t1, $t2, 16, 16              \n"
+    "ins            $t2, $t8, 0, 16               \n"
+    "raddu.w.qb     $t1, $t1                      \n"
+    "raddu.w.qb     $t2, $t2                      \n"
+    "shra_r.w       $t1, $t1, 2                   \n"
+    "shra_r.w       $t2, $t2, 2                   \n"
+    "sb             $t1, 0(%[dst])                \n"
+    "sb             $t2, 1(%[dst])                \n"
+    "addiu          %[src_ptr], %[src_ptr], 4     \n"
+    "addiu          $t9, $t9, -2                  \n"
+    "addiu          %[t], %[t], 4                 \n"
+    "bgtz           $t9, 21b                      \n"
+    " addiu         %[dst], %[dst], 2             \n"
+
+  "3:                                             \n"
+    ".set pop                                     \n"
+
+  : [src_ptr] "+r" (src_ptr),
+    [dst] "+r" (dst), [t] "+r" (t)
+  : [dst_width] "r" (dst_width)
+  : "t0", "t1", "t2", "t3", "t4", "t5",
+    "t6", "t7", "t8", "t9"
+  );
+}
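The box variant averages a 2x2 block per output byte; the +2 bias below matches the rounding shift (shra_r.w ..., 2). A C sketch of the same operation (hypothetical helper name):

    static void ScaleRowDown2Box_sketch(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst, int dst_width) {
      const uint8* s = src_ptr;
      const uint8* t = src_ptr + src_stride;  // second source row
      int x;
      for (x = 0; x < dst_width; ++x) {
        dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // rounded avg
        s += 2;
        t += 2;
      }
    }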
+
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width) {
+  __asm__ __volatile__ (
+      ".set push                                    \n"
+      ".set noreorder                               \n"
+
+      "srl            $t9, %[dst_width], 3          \n"
+      "beqz           $t9, 2f                       \n"
+      " nop                                         \n"
+
+      ".p2align       2                             \n"
+     "1:                                            \n"
+      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
+      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
+      "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
+      "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
+      "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
+      "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
+      "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
+      "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
+      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
+      "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
+      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
+      "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
+      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
+      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
+      "addiu          %[src_ptr], %[src_ptr], 32    \n"
+      "addiu          $t9, $t9, -1                  \n"
+      "sw             $t1, 0(%[dst])                \n"
+      "sw             $t5, 4(%[dst])                \n"
+      "bgtz           $t9, 1b                       \n"
+      " addiu         %[dst], %[dst], 8             \n"
+
+    "2:                                             \n"
+      "andi           $t9, %[dst_width], 7          \n"  // residue
+      "beqz           $t9, 3f                       \n"
+      " nop                                         \n"
+
+    "21:                                            \n"
+      "lbu            $t1, 0(%[src_ptr])            \n"
+      "addiu          %[src_ptr], %[src_ptr], 4     \n"
+      "addiu          $t9, $t9, -1                  \n"
+      "sb             $t1, 0(%[dst])                \n"
+      "bgtz           $t9, 21b                      \n"
+      " addiu         %[dst], %[dst], 1             \n"
+
+    "3:                                             \n"
+      ".set pop                                     \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst)
+      : [dst_width] "r" (dst_width)
+      : "t1", "t2", "t3", "t4", "t5",
+        "t6", "t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                 uint8* dst, int dst_width) {
+  intptr_t stride = src_stride;
+  const uint8* s1 = src_ptr + stride;
+  const uint8* s2 = s1 + stride;
+  const uint8* s3 = s2 + stride;
+
+  __asm__ __volatile__ (
+      ".set push                                  \n"
+      ".set noreorder                             \n"
+
+      "srl           $t9, %[dst_width], 1         \n"
+      "andi          $t8, %[dst_width], 1         \n"
+
+      ".p2align      2                            \n"
+     "1:                                          \n"
+      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
+      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
+      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
+      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
+      "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
+      "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
+      "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
+      "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
+      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
+      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
+      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
+      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
+      "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
+      "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
+      "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
+      "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
+      "add           $t0, $t0, $t1                \n"
+      "add           $t1, $t2, $t3                \n"
+      "add           $t0, $t0, $t1                \n"
+      "add           $t4, $t4, $t5                \n"
+      "add           $t6, $t6, $t7                \n"
+      "add           $t4, $t4, $t6                \n"
+      "shra_r.w      $t0, $t0, 4                  \n"
+      "shra_r.w      $t4, $t4, 4                  \n"
+      "sb            $t0, 0(%[dst])               \n"
+      "sb            $t4, 1(%[dst])               \n"
+      "addiu         %[src_ptr], %[src_ptr], 8    \n"
+      "addiu         %[s1], %[s1], 8              \n"
+      "addiu         %[s2], %[s2], 8              \n"
+      "addiu         %[s3], %[s3], 8              \n"
+      "addiu         $t9, $t9, -1                 \n"
+      "bgtz          $t9, 1b                      \n"
+      " addiu        %[dst], %[dst], 2            \n"
+      "beqz          $t8, 2f                      \n"
+      " nop                                       \n"
+
+      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
+      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
+      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
+      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
+      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
+      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
+      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
+      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
+      "add           $t0, $t0, $t1                \n"
+      "add           $t1, $t2, $t3                \n"
+      "add           $t0, $t0, $t1                \n"
+      "shra_r.w      $t0, $t0, 4                  \n"
+      "sb            $t0, 0(%[dst])               \n"
+
+      "2:                                         \n"
+      ".set pop                                   \n"
+
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [s1] "+r" (s1),
+        [s2] "+r" (s2),
+        [s3] "+r" (s3)
+      : [dst_width] "r" (dst_width)
+      : "t0", "t1", "t2", "t3", "t4", "t5",
+        "t6","t7", "t8", "t9"
+  );
+}
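Likewise, the 4x4 box kernel reduces each 4x4 block of source bytes to one output byte with rounding (shra_r.w ..., 4 is a divide-by-16 with a +8 bias). A sketch with a hypothetical helper name:

    static void ScaleRowDown4Box_sketch(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst, int dst_width) {
      int x, i, j;
      for (x = 0; x < dst_width; ++x) {
        int sum = 0;
        for (j = 0; j < 4; ++j) {      // four source rows
          for (i = 0; i < 4; ++i) {    // four source columns
            sum += src_ptr[j * src_stride + x * 4 + i];
          }
        }
        dst[x] = (uint8)((sum + 8) >> 4);
      }
    }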
+
+void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  __asm__ __volatile__ (
+      ".set push                                          \n"
+      ".set noreorder                                     \n"
+      ".p2align        2                                  \n"
+    "1:                                                   \n"
+      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
+      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
+      "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
+      "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
+      "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
+      "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
+      "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
+      "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
+      "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
+      "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|30|
+      "addiu           %[dst_width], %[dst_width], -24    \n"
+      "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
+      "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
+      "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
+      "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
+      "addiu           %[src_ptr], %[src_ptr], 32         \n"
+      "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
+      "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
+      "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
+      "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
+      "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
+      "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
+      "sw              $t1, 0(%[dst])                     \n"
+      "sw              $t0, 4(%[dst])                     \n"
+      "sw              $t3, 8(%[dst])                     \n"
+      "sw              $t5, 12(%[dst])                    \n"
+      "sw              $t9, 16(%[dst])                    \n"
+      "sw              $t7, 20(%[dst])                    \n"
+      "bnez            %[dst_width], 1b                   \n"
+      " addiu          %[dst], %[dst], 24                 \n"
+      ".set pop                                           \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3", "t4", "t5",
+        "t6","t7", "t8", "t9"
+  );
+}
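The unfiltered 3/4 row simply drops pixel 2 of every group of 4 (the kept indices in the word comments are 0,1,3, 4,5,7, and so on). In C form (hypothetical helper name; assumes dst_width is a multiple of 3, as the assembly assumes multiples of 24):

    static void ScaleRowDown34_sketch(const uint8* src_ptr, uint8* dst,
                                      int dst_width) {
      int x;
      for (x = 0; x < dst_width; x += 3) {
        dst[x + 0] = src_ptr[0];
        dst[x + 1] = src_ptr[1];
        dst[x + 2] = src_ptr[3];  // pixel 2 of each group of 4 is dropped
        src_ptr += 4;
      }
    }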
+
+void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* d, int dst_width) {
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "repl.ph           $t3, 3                          \n"  // 0x00030003
+
+     ".p2align           2                               \n"
+    "1:                                                  \n"
+      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
+      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
+      "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
+      "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
+      "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
+      "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
+      "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
+      "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
+      "raddu.w.qb        $t0, $t0                        \n"
+      "raddu.w.qb        $t1, $t1                        \n"
+      "shra_r.w          $t0, $t0, 1                     \n"
+      "shra_r.w          $t1, $t1, 1                     \n"
+      "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
+      "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
+      "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
+      "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
+      "addu.ph           $t2, $t2, $t4                   \n"
+      "addu.ph           $t6, $t6, $t5                   \n"
+      "sll               $t5, $t0, 1                     \n"
+      "add               $t0, $t5, $t0                   \n"
+      "shra_r.ph         $t2, $t2, 2                     \n"
+      "shra_r.ph         $t6, $t6, 2                     \n"
+      "shll.ph           $t4, $t2, 1                     \n"
+      "addq.ph           $t4, $t4, $t2                   \n"
+      "addu              $t0, $t0, $t1                   \n"
+      "addiu             %[src_ptr], %[src_ptr], 4       \n"
+      "shra_r.w          $t0, $t0, 2                     \n"
+      "addu.ph           $t6, $t6, $t4                   \n"
+      "shra_r.ph         $t6, $t6, 2                     \n"
+      "srl               $t1, $t6, 16                    \n"
+      "addiu             %[dst_width], %[dst_width], -3  \n"
+      "sb                $t1, 0(%[d])                    \n"
+      "sb                $t0, 1(%[d])                    \n"
+      "sb                $t6, 2(%[d])                    \n"
+      "bgtz              %[dst_width], 1b                \n"
+      " addiu            %[d], %[d], 3                   \n"
+    "3:                                                  \n"
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [src_stride] "+r" (src_stride),
+        [d] "+r" (d),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3",
+        "t4", "t5", "t6"
+  );
+}
+
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* d, int dst_width) {
+  __asm__ __volatile__ (
+      ".set push                                           \n"
+      ".set noreorder                                      \n"
+      "repl.ph           $t2, 3                            \n"  // 0x00030003
+
+      ".p2align          2                                 \n"
+    "1:                                                    \n"
+      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
+      "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
+      "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
+      "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
+      "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
+      "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
+      "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
+      "raddu.w.qb        $t0, $t0                          \n"
+      "raddu.w.qb        $t1, $t1                          \n"
+      "shra_r.w          $t0, $t0, 1                       \n"
+      "shra_r.w          $t1, $t1, 1                       \n"
+      "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
+      "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
+      "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
+      "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
+      "addu.ph           $t4, $t4, $t3                     \n"
+      "addu.ph           $t6, $t6, $t5                     \n"
+      "shra_r.ph         $t6, $t6, 2                       \n"
+      "shra_r.ph         $t4, $t4, 2                       \n"
+      "addu.ph           $t6, $t6, $t4                     \n"
+      "addiu             %[src_ptr], %[src_ptr], 4         \n"
+      "shra_r.ph         $t6, $t6, 1                       \n"
+      "addu              $t0, $t0, $t1                     \n"
+      "addiu             %[dst_width], %[dst_width], -3    \n"
+      "shra_r.w          $t0, $t0, 1                       \n"
+      "srl               $t1, $t6, 16                      \n"
+      "sb                $t1, 0(%[d])                      \n"
+      "sb                $t0, 1(%[d])                      \n"
+      "sb                $t6, 2(%[d])                      \n"
+      "bgtz              %[dst_width], 1b                  \n"
+      " addiu            %[d], %[d], 3                     \n"
+    "3:                                                    \n"
+      ".set pop                                            \n"
+      : [src_ptr] "+r" (src_ptr),
+        [src_stride] "+r" (src_stride),
+        [d] "+r" (d),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3",
+        "t4", "t5", "t6"
+  );
+}
+
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  __asm__ __volatile__ (
+      ".set push                                     \n"
+      ".set noreorder                                \n"
+
+      ".p2align   2                                  \n"
+    "1:                                              \n"
+      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
+      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
+      "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
+      "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
+      "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
+      "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
+      "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
+      "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
+      "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
+      "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
+      "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
+      "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
+      "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
+      "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
+      "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
+      "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
+      "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
+      "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
+      "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
+      "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
+      "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
+      "addiu      %[src_ptr], %[src_ptr], 32         \n"
+      "addiu      %[dst_width], %[dst_width], -12    \n"
+      "addiu      $t8,%[dst_width], -12              \n"
+      "sw         $t1, 0(%[dst])                     \n"
+      "sw         $t4, 4(%[dst])                     \n"
+      "sw         $t6, 8(%[dst])                     \n"
+      "bgez       $t8, 1b                            \n"
+      " addiu     %[dst], %[dst], 12                 \n"
+      ".set pop                                      \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst] "+r" (dst),
+        [dst_width] "+r" (dst_width)
+      :
+      : "t0", "t1", "t2", "t3", "t4",
+        "t5", "t6", "t7", "t8"
+  );
+}
+
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  const uint8* t = src_ptr + stride;
+  const int c = 0x2AAA;
+
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+
+      ".p2align        2                                 \n"
+    "1:                                                  \n"
+      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
+      "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
+      "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
+      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
+      "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
+      "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
+      "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
+      "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
+      "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
+      "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
+      "srl             $t4, $t4, 2                       \n"  // t4 / 4
+      "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
+      "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
+      "addu            $t6, $t5, $t6                     \n"
+      "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
+      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
+      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
+      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
+      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
+      "addu            $t0, $t0, $t2                     \n"
+      "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
+      "addiu           %[src_ptr], %[src_ptr], 8         \n"
+      "addiu           %[t], %[t], 8                     \n"
+      "addiu           %[dst_width], %[dst_width], -3    \n"
+      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
+      "srl             $t6, $t6, 16                      \n"
+      "srl             $t0, $t0, 16                      \n"
+      "sb              $t4, -1(%[dst_ptr])               \n"
+      "sb              $t6, -2(%[dst_ptr])               \n"
+      "bgtz            %[dst_width], 1b                  \n"
+      " sb             $t0, -3(%[dst_ptr])               \n"
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst_ptr] "+r" (dst_ptr),
+        [t] "+r" (t),
+        [dst_width] "+r" (dst_width)
+      : [c] "r" (c)
+      : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
+  );
+}
+
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  const uint8* s1 = src_ptr + stride;
+  stride += stride;
+  const uint8* s2 = src_ptr + stride;
+  const int c1 = 0x1C71;
+  const int c2 = 0x2AAA;
+
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+
+      ".p2align        2                                 \n"
+    "1:                                                  \n"
+      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
+      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
+      "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
+      "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
+      "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
+      "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
+      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
+      "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
+      "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
+      "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
+      "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
+      "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
+      "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
+      "addu            $t7, $t7, $t8                     \n"
+      "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
+      "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
+      "addu            $t6, $t6, $t8                     \n"
+      "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
+      "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
+      "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
+      "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
+      "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
+      "addu            $t7, $t7, $t8                     \n"
+      "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
+      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
+      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
+      "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
+      "raddu.w.qb      $t0, $t0                          \n"
+      "raddu.w.qb      $t2, $t2                          \n"
+      "raddu.w.qb      $t4, $t4                          \n"
+      "addu            $t0, $t0, $t2                     \n"
+      "addu            $t0, $t0, $t4                     \n"
+      "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
+      "addiu           %[src_ptr], %[src_ptr], 8         \n"
+      "addiu           %[s1], %[s1], 8                   \n"
+      "addiu           %[s2], %[s2], 8                   \n"
+      "addiu           %[dst_width], %[dst_width], -3    \n"
+      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
+      "srl             $t6, $t6, 16                      \n"
+      "srl             $t7, $t7, 16                      \n"
+      "srl             $t0, $t0, 16                      \n"
+      "sb              $t6, -1(%[dst_ptr])               \n"
+      "sb              $t7, -2(%[dst_ptr])               \n"
+      "bgtz            %[dst_width], 1b                  \n"
+      " sb             $t0, -3(%[dst_ptr])               \n"
+      ".set pop                                          \n"
+      : [src_ptr] "+r" (src_ptr),
+        [dst_ptr] "+r" (dst_ptr),
+        [s1] "+r" (s1),
+        [s2] "+r" (s2),
+        [dst_width] "+r" (dst_width)
+      : [c1] "r" (c1), [c2] "r" (c2)
+      : "t0", "t1", "t2", "t3", "t4",
+        "t5", "t6", "t7", "t8"
+  );
+}
+
+#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
--- /dev/null
+++ b/third_party/libyuv/source/scale_neon.cc
@@ -1,0 +1,684 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon.
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+// NEON downscalers with interpolation.
+// Provided by Fritz Koenig
+
+// Read 32x1, throw away even pixels, and write 16x1.
+void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
+    "vld2.8     {q0, q1}, [%0]!                \n"
+    "subs       %2, %2, #16                    \n"  // 16 processed per loop
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1"              // Clobber List
+  );
+}
+
+// Read 32x2 average down and write 16x1.
+void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
+    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
+    "subs       %3, %3, #16                    \n"  // 16 processed per loop
+    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
+    "vpaddl.u8  q1, q1                         \n"
+    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
+    "vpadal.u8  q1, q3                         \n"
+    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #2                     \n"
+    "vst1.8     {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "q0", "q1", "q2", "q3"     // Clobber List
+  );
+}
+
+void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
+    "subs       %2, %2, #8                     \n" // 8 processed per loop
+    "vst1.8     {d2}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "q0", "q1", "memory", "cc"
+  );
+}
+
+void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "add        r4, %0, %3                     \n"
+    "add        r5, r4, %3                     \n"
+    "add        %3, r5, %3                     \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
+    "vld1.8     {q1}, [r4]!                    \n"
+    "vld1.8     {q2}, [r5]!                    \n"
+    "vld1.8     {q3}, [%3]!                    \n"
+    "subs       %2, %2, #4                     \n"
+    "vpaddl.u8  q0, q0                         \n"
+    "vpadal.u8  q0, q1                         \n"
+    "vpadal.u8  q0, q2                         \n"
+    "vpadal.u8  q0, q3                         \n"
+    "vpaddl.u16 q0, q0                         \n"
+    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
+    "vmovn.u16  d0, q0                         \n"
+    "vst1.32    {d0[0]}, [%1]!                 \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  : "r"(src_stride)         // %3
+  : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+  );
+}
+
+// Down scale from 4 to 3 pixels. Use the neon multilane read/write
+// to load up every 4th pixel into 4 different registers.
+// Point samples 32 pixels to 24 pixels.
+void ScaleRowDown34_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    "subs       %2, %2, #24                  \n"
+    "vmov       d2, d3                       \n" // order d0, d1, d2
+    "vst3.8     {d0, d1, d2}, [%1]!          \n"
+    "bgt        1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  :
+  : "d0", "d1", "d2", "d3", "memory", "cc"
+  );
+}
+
+void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vmov.u8    d24, #3                        \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n"
+
+    // filter src line 0 with src line 1
+    // expand chars to shorts to allow for room
+    // when adding lines together
+    "vmovl.u8     q8, d4                       \n"
+    "vmovl.u8     q9, d5                       \n"
+    "vmovl.u8     q10, d6                      \n"
+    "vmovl.u8     q11, d7                      \n"
+
+    // 3 * line_0 + line_1
+    "vmlal.u8     q8, d0, d24                  \n"
+    "vmlal.u8     q9, d1, d24                  \n"
+    "vmlal.u8     q10, d2, d24                 \n"
+    "vmlal.u8     q11, d3, d24                 \n"
+
+    // (3 * line_0 + line_1) >> 2
+    "vqrshrn.u16  d0, q8, #2                   \n"
+    "vqrshrn.u16  d1, q9, #2                   \n"
+    "vqrshrn.u16  d2, q10, #2                  \n"
+    "vqrshrn.u16  d3, q11, #2                  \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q8, d1                       \n"
+    "vmlal.u8     q8, d0, d24                  \n"
+    "vqrshrn.u16  d0, q8, #2                   \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2                   \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q8, d2                       \n"
+    "vmlal.u8     q8, d3, d24                  \n"
+    "vqrshrn.u16  d2, q8, #2                   \n"
+
+    "vst3.8       {d0, d1, d2}, [%1]!          \n"
+
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
+  );
+}
+
+void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vmov.u8    d24, #3                        \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
+    "subs         %2, %2, #24                  \n"
+    // average src line 0 with src line 1
+    "vrhadd.u8    q0, q0, q2                   \n"
+    "vrhadd.u8    q1, q1, q3                   \n"
+
+    // a0 = (src[0] * 3 + s[1] * 1) >> 2
+    "vmovl.u8     q3, d1                       \n"
+    "vmlal.u8     q3, d0, d24                  \n"
+    "vqrshrn.u16  d0, q3, #2                   \n"
+
+    // a1 = (src[1] * 1 + s[2] * 1) >> 1
+    "vrhadd.u8    d1, d1, d2                   \n"
+
+    // a2 = (src[2] * 1 + s[3] * 3) >> 2
+    "vmovl.u8     q3, d2                       \n"
+    "vmlal.u8     q3, d3, d24                  \n"
+    "vqrshrn.u16  d2, q3, #2                   \n"
+
+    "vst3.8       {d0, d1, d2}, [%1]!          \n"
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  :
+  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
+  );
+}
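Both 3/4 box variants finish with the same three filter taps, (3,1), (1,1) and (1,3), applied to the row-blended source; only the row blend differs (3:1 for _0_Box, 1:1 for _1_Box). Per group of four source bytes the arithmetic is roughly as follows (hypothetical helper, rounding as in the vqrshrn/vrhadd instructions above):

    static void ScaleRowDown34_taps_sketch(const uint8* s /* 4 bytes in */,
                                           uint8* d /* 3 bytes out */) {
      d[0] = (uint8)((s[0] * 3 + s[1] + 2) >> 2);  // taps 3,1
      d[1] = (uint8)((s[1] + s[2] + 1) >> 1);      // taps 1,1
      d[2] = (uint8)((s[2] + s[3] * 3 + 2) >> 2);  // taps 1,3
    }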
+
+#define HAS_SCALEROWDOWN38_NEON
+static uvec8 kShuf38 =
+  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
+static uvec8 kShuf38_2 =
+  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
+static vec16 kMult38_Div6 =
+  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
+    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
+static vec16 kMult38_Div9 =
+  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
+    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
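These multiplier tables feed vqrdmulh.s16, which returns the rounded high 16 bits of 2*a*b, so 65536/12 acts as a divide-by-6 and 65536/18 as a divide-by-9. A small standalone check of that identity (illustrative only, not part of the patch):

    #include <stdio.h>
    int main(void) {
      const int kDiv6 = 65536 / 12;                      /* 5461, as in kMult38_Div6 */
      const int sum6 = 99 + 100 + 101 + 99 + 100 + 101;  /* six-pixel sum == 600 */
      const int avg = (2 * sum6 * kDiv6 + 0x8000) >> 16; /* vqrdmulh.s16 semantics */
      printf("%d\n", avg);                               /* prints 100 == 600 / 6 */
      return 0;
    }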
+
+// 32 -> 12
+void ScaleRowDown38_NEON(const uint8* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vld1.8     {q3}, [%3]                     \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
+    "subs       %2, %2, #12                    \n"
+    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
+    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
+    "vst1.8     {d4}, [%1]!                    \n"
+    "vst1.32    {d5[0]}, [%1]!                 \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width)         // %2
+  : "r"(&kShuf38)           // %3
+  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
+  );
+}
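kShuf38 is just a gather table: of every 32 source bytes, the 12 at indices 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27 and 30 are kept. The vtbl pair above is equivalent to this scalar sketch (hypothetical helper name; assumes dst_width is a multiple of 12):

    static void ScaleRowDown38_sketch(const uint8* src_ptr, uint8* dst_ptr,
                                      int dst_width) {
      static const uint8 kIdx[12] =
          { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30 };
      int x, i;
      for (x = 0; x < dst_width; x += 12) {
        for (i = 0; i < 12; ++i) {
          dst_ptr[x + i] = src_ptr[kIdx[i]];
        }
        src_ptr += 32;  // 32 source pixels per 12 outputs (3/8 scale)
      }
    }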
+
+// 32x3 -> 12x1
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+                                      ptrdiff_t src_stride,
+                                      uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vld1.16    {q13}, [%4]                    \n"
+    "vld1.8     {q14}, [%5]                    \n"
+    "vld1.8     {q15}, [%6]                    \n"
+    "add        r4, %0, %3, lsl #1             \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    "vld4.8       {d16, d17, d18, d19}, [r4]!  \n"
+    "subs         %2, %2, #12                  \n"
+
+    // Shuffle the input data around to align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8      d0, d1                       \n"
+    "vtrn.u8      d4, d5                       \n"
+    "vtrn.u8      d16, d17                     \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8      d2, d3                       \n"
+    "vtrn.u8      d6, d7                       \n"
+    "vtrn.u8      d18, d19                     \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d2 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8    q0, q0                       \n"
+    "vpaddl.u8    q2, q2                       \n"
+    "vpaddl.u8    q8, q8                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8    d3, d3                       \n"
+    "vpaddl.u8    d7, d7                       \n"
+    "vpaddl.u8    d19, d19                     \n"
+
+    // combine source lines
+    "vadd.u16     q0, q2                       \n"
+    "vadd.u16     q0, q8                       \n"
+    "vadd.u16     d4, d3, d7                   \n"
+    "vadd.u16     d4, d19                      \n"
+
+    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+    //             + s[6 + st * 1] + s[7 + st * 1]
+    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+    "vqrdmulh.s16 q2, q2, q13                  \n"
+    "vmovn.u16    d4, q2                       \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8     q1, d2                       \n"
+    "vmovl.u8     q3, d6                       \n"
+    "vmovl.u8     q9, d18                      \n"
+
+    // combine source lines
+    "vadd.u16     q1, q3                       \n"
+    "vadd.u16     q1, q9                       \n"
+
+    // d4 = xx 20 xx 30 xx 22 xx 32
+    // d5 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32     d2, d3                       \n"
+
+    // d4 = xx 20 xx 21 xx 22 xx 23
+    // d5 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16     d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16     q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "vqrdmulh.s16 q0, q0, q15                  \n"
+
+    // Align for table lookup, vtbl requires registers to
+    //  be adjacent
+    "vmov.u8      d2, d4                       \n"
+
+    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+
+    "vst1.8       {d3}, [%1]!                  \n"
+    "vst1.32      {d4[0]}, [%1]!               \n"
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst_ptr),          // %1
+    "+r"(dst_width),        // %2
+    "+r"(src_stride)        // %3
+  : "r"(&kMult38_Div6),     // %4
+    "r"(&kShuf38_2),        // %5
+    "r"(&kMult38_Div9)      // %6
+  : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
+    "q13", "q14", "q15", "memory", "cc"
+  );
+}
+
+// 32x2 -> 12x1
+void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "vld1.16    {q13}, [%4]                    \n"
+    "vld1.8     {q14}, [%5]                    \n"
+    "add        %3, %0                         \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+
+    // d0 = 00 40 01 41 02 42 03 43
+    // d1 = 10 50 11 51 12 52 13 53
+    // d2 = 20 60 21 61 22 62 23 63
+    // d3 = 30 70 31 71 32 72 33 73
+    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+    "subs         %2, %2, #12                  \n"
+
+    // Shuffle the input data around to align the data
+    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+    // d0 = 00 10 01 11 02 12 03 13
+    // d1 = 40 50 41 51 42 52 43 53
+    "vtrn.u8      d0, d1                       \n"
+    "vtrn.u8      d4, d5                       \n"
+
+    // d2 = 20 30 21 31 22 32 23 33
+    // d3 = 60 70 61 71 62 72 63 73
+    "vtrn.u8      d2, d3                       \n"
+    "vtrn.u8      d6, d7                       \n"
+
+    // d0 = 00+10 01+11 02+12 03+13
+    // d2 = 40+50 41+51 42+52 43+53
+    "vpaddl.u8    q0, q0                       \n"
+    "vpaddl.u8    q2, q2                       \n"
+
+    // d3 = 60+70 61+71 62+72 63+73
+    "vpaddl.u8    d3, d3                       \n"
+    "vpaddl.u8    d7, d7                       \n"
+
+    // combine source lines
+    "vadd.u16     q0, q2                       \n"
+    "vadd.u16     d4, d3, d7                   \n"
+
+    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+    "vqrshrn.u16  d4, q2, #2                   \n"
+
+    // Shuffle 2,3 reg around so that 2 can be added to the
+    //  0,1 reg and 3 can be added to the 4,5 reg. This
+    //  requires expanding from u8 to u16 as the 0,1 and 4,5
+    //  registers are already expanded. Then do transposes
+    //  to get aligned.
+    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+    "vmovl.u8     q1, d2                       \n"
+    "vmovl.u8     q3, d6                       \n"
+
+    // combine source lines
+    "vadd.u16     q1, q3                       \n"
+
+    // d4 = xx 20 xx 30 xx 22 xx 32
+    // d5 = xx 21 xx 31 xx 23 xx 33
+    "vtrn.u32     d2, d3                       \n"
+
+    // d4 = xx 20 xx 21 xx 22 xx 23
+    // d5 = xx 30 xx 31 xx 32 xx 33
+    "vtrn.u16     d2, d3                       \n"
+
+    // 0+1+2, 3+4+5
+    "vadd.u16     q0, q1                       \n"
+
+    // Need to divide, but can't downshift as the value
+    //  isn't a power of 2. So multiply by 65536 / n
+    //  and take the upper 16 bits.
+    "vqrdmulh.s16 q0, q0, q13                  \n"
+
+    // Align for table lookup, vtbl requires registers to
+    //  be adjacent
+    "vmov.u8      d2, d4                       \n"
+
+    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+
+    "vst1.8       {d3}, [%1]!                  \n"
+    "vst1.32      {d4[0]}, [%1]!               \n"
+    "bgt          1b                           \n"
+  : "+r"(src_ptr),       // %0
+    "+r"(dst_ptr),       // %1
+    "+r"(dst_width),     // %2
+    "+r"(src_stride)     // %3
+  : "r"(&kMult38_Div6),  // %4
+    "r"(&kShuf38_2)      // %5
+  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
+  );
+}
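+
+// Illustrative scalar sketch of the fixed-point division used above (for
+// reference only; the helper name is hypothetical and the uint16/uint32
+// typedefs are assumed from libyuv/basic_types.h). A sum is divided by a
+// non-power-of-two n by multiplying with 65536 / n and keeping the upper
+// 16 bits, roughly what vqrdmulh.s16 with kMult38_Div6 / kMult38_Div9 does
+// in vector form.
+static inline uint16 ScaleSumByReciprocal(uint32 sum, uint16 recip) {
+  // recip is 65536 / n, e.g. 65536 / 6 or 65536 / 9.
+  return (uint16)((sum * recip) >> 16);
+}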
+
+// 16x2 -> 16x1
+void ScaleFilterRows_NEON(uint8* dst_ptr,
+                          const uint8* src_ptr, ptrdiff_t src_stride,
+                          int dst_width, int source_y_fraction) {
+  asm volatile (
+    "cmp          %4, #0                       \n"
+    "beq          100f                         \n"
+    "add          %2, %1                       \n"
+    "cmp          %4, #64                      \n"
+    "beq          75f                          \n"
+    "cmp          %4, #128                     \n"
+    "beq          50f                          \n"
+    "cmp          %4, #192                     \n"
+    "beq          25f                          \n"
+
+    "vdup.8       d5, %4                       \n"
+    "rsb          %4, #256                     \n"
+    "vdup.8       d4, %4                       \n"
+    // General purpose row blend.
+  "1:                                          \n"
+    "vld1.8       {q0}, [%1]!                  \n"
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vmull.u8     q13, d0, d4                  \n"
+    "vmull.u8     q14, d1, d4                  \n"
+    "vmlal.u8     q13, d2, d5                  \n"
+    "vmlal.u8     q14, d3, d5                  \n"
+    "vrshrn.u16   d0, q13, #8                  \n"
+    "vrshrn.u16   d1, q14, #8                  \n"
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          1b                           \n"
+    "b            99f                          \n"
+
+    // Blend 25 / 75.
+  "25:                                         \n"
+    "vld1.8       {q0}, [%1]!                  \n"
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          25b                          \n"
+    "b            99f                          \n"
+
+    // Blend 50 / 50.
+  "50:                                         \n"
+    "vld1.8       {q0}, [%1]!                  \n"
+    "vld1.8       {q1}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          50b                          \n"
+    "b            99f                          \n"
+
+    // Blend 75 / 25.
+  "75:                                         \n"
+    "vld1.8       {q1}, [%1]!                  \n"
+    "vld1.8       {q0}, [%2]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vrhadd.u8    q0, q1                       \n"
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          75b                          \n"
+    "b            99f                          \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+  "100:                                        \n"
+    "vld1.8       {q0}, [%1]!                  \n"
+    "subs         %3, %3, #16                  \n"
+    "vst1.8       {q0}, [%0]!                  \n"
+    "bgt          100b                         \n"
+
+  "99:                                         \n"
+    "vst1.8       {d1[7]}, [%0]                \n"
+  : "+r"(dst_ptr),          // %0
+    "+r"(src_ptr),          // %1
+    "+r"(src_stride),       // %2
+    "+r"(dst_width),        // %3
+    "+r"(source_y_fraction) // %4
+  :
+  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
+  );
+}
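+
+// Illustrative scalar form of the general row blend above (a sketch for
+// reference, not upstream libyuv code). source_y_fraction f is in [0, 256);
+// the 25/50/75/100 labels are faster special cases of the same blend.
+static inline uint8 BlendRowsPixel(uint8 row0, uint8 row1, int f) {
+  // vrshrn.u16 #8 rounds, hence the +128.
+  return (uint8)((row0 * (256 - f) + row1 * f + 128) >> 8);
+}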
+
+void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                            uint8* dst, int dst_width) {
+  asm volatile (
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    // load even pixels into q0, odd into q1
+    "vld2.32    {q0, q1}, [%0]!                \n"
+    "vld2.32    {q2, q3}, [%0]!                \n"
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop
+    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+    "vst1.8     {q3}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(dst),              // %1
+    "+r"(dst_width)         // %2
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+  );
+}
+
+void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  asm volatile (
+    // change the stride to row 2 pointer
+    "add        %1, %1, %0                     \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
+    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
+    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
+    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
+    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
+    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
+    "vrshrn.u16 d1, q1, #2                     \n"
+    "vrshrn.u16 d2, q2, #2                     \n"
+    "vrshrn.u16 d3, q3, #2                     \n"
+    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
+    "bgt        1b                             \n"
+  : "+r"(src_ptr),          // %0
+    "+r"(src_stride),       // %1
+    "+r"(dst),              // %2
+    "+r"(dst_width)         // %3
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+  );
+}
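+
+// Illustrative scalar form of the 2x2 box filter above, applied independently
+// to each B, G, R and A channel (a sketch for reference, not upstream libyuv).
+static inline uint8 ScaleBox2x2(uint8 p00, uint8 p01, uint8 p10, uint8 p11) {
+  // vrshrn.u16 #2 is a rounded divide by 4.
+  return (uint8)((p00 + p01 + p10 + p11 + 2) >> 2);
+}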
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
+                               int src_stepx, uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %3, lsl #2                \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.32    {d0[0]}, [%0], r12             \n"
+    "vld1.32    {d0[1]}, [%0], r12             \n"
+    "vld1.32    {d1[0]}, [%0], r12             \n"
+    "vld1.32    {d1[1]}, [%0], r12             \n"
+    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+    "vst1.8     {q0}, [%1]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(dst_argb),    // %1
+    "+r"(dst_width)    // %2
+  : "r"(src_stepx)     // %3
+  : "memory", "cc", "r12", "q0"
+  );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: src_argb 4 byte aligned.
+void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "mov        r12, %4, lsl #2                \n"
+    "add        %1, %1, %0                     \n"
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
+    "vld1.8     {d1}, [%1], r12                \n"
+    "vld1.8     {d2}, [%0], r12                \n"
+    "vld1.8     {d3}, [%1], r12                \n"
+    "vld1.8     {d4}, [%0], r12                \n"
+    "vld1.8     {d5}, [%1], r12                \n"
+    "vld1.8     {d6}, [%0], r12                \n"
+    "vld1.8     {d7}, [%1], r12                \n"
+    "vaddl.u8   q0, d0, d1                     \n"
+    "vaddl.u8   q1, d2, d3                     \n"
+    "vaddl.u8   q2, d4, d5                     \n"
+    "vaddl.u8   q3, d6, d7                     \n"
+    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
+    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
+    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
+    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
+    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
+    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
+    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
+    "vst1.8     {q0}, [%2]!                    \n"
+    "bgt        1b                             \n"
+  : "+r"(src_argb),    // %0
+    "+r"(src_stride),  // %1
+    "+r"(dst_argb),    // %2
+    "+r"(dst_width)    // %3
+  : "r"(src_stepx)     // %4
+  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
+  );
+}
+
+#endif  // __ARM_NEON__
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/scale_posix.cc
@@ -1,0 +1,1315 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC x86 and x64.
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf2 =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+static uvec8 kShuf11 =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf21 =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Rounding constant for the 3/4 box filters
+static vec16 kRound34 =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// GCC versions of row functions are verbatim conversions from Visual C.
+// Generated using gcc disassembly on Visual C object file:
+// objdump -D yuvscaler.obj >yuvscaler.txt
+
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
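+
+// Illustrative scalar equivalent of ScaleRowDown2_SSE2 (a sketch for
+// reference, not upstream libyuv): psrlw $0x8 followed by packuswb keeps the
+// odd source bytes, i.e. dst[i] = src[2 * i + 1].
+static inline void ScaleRowDown2_Sketch(const uint8* src_ptr, uint8* dst_ptr,
+                                        int dst_width) {
+  for (int i = 0; i < dst_width; ++i) {
+    dst_ptr[i] = src_ptr[2 * i + 1];
+  }
+}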
+
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "pand      %%xmm5,%%xmm2                   \n"
+    "pand      %%xmm5,%%xmm3                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"
+    "pavgw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa  (%0,%3,1),%%xmm2
+    BUNDLEALIGN
+    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa  0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "pand      %%xmm5,%%xmm2                   \n"
+    "pand      %%xmm5,%%xmm3                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"
+    "pavgw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+
+void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+
+void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
+                                        ptrdiff_t src_stride,
+                                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "pand      %%xmm5,%%xmm2                   \n"
+    "pand      %%xmm5,%%xmm3                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"
+    "pavgw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
+void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
+    BUNDLEALIGN
+    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "pand      %%xmm5,%%xmm2                   \n"
+    "pand      %%xmm5,%%xmm3                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"
+    "pavgw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+  );
+}
+
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrld     $0x18,%%xmm5                    \n"
+    "pslld     $0x10,%%xmm5                    \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  intptr_t stridex3 = 0;
+  asm volatile (
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"
+    "psrlw     $0x8,%%xmm7                     \n"
+    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
+    BUNDLEALIGN
+    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    MEMOPREG(movdqa,0x00,0,4,2,xmm2)           //  movdqa  (%0,%4,2),%%xmm2
+    BUNDLEALIGN
+    MEMOPREG(movdqa,0x10,0,4,2,xmm3)           //  movdqa  0x10(%0,%4,2),%%xmm3
+    MEMOPREG(movdqa,0x00,0,3,1,xmm4)           //  movdqa  (%0,%3,1),%%xmm4
+    MEMOPREG(movdqa,0x10,0,3,1,xmm5)           //  movdqa  0x10(%0,%3,1),%%xmm5
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm4,%%xmm2                   \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm5,%%xmm3                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "pand      %%xmm7,%%xmm2                   \n"
+    "pand      %%xmm7,%%xmm3                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"
+    "pavgw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "pand      %%xmm7,%%xmm2                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x8,1) ",%1            \n"
+    "sub       $0x8,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width),   // %2
+    "+r"(stridex3)     // %3
+  : "r"((intptr_t)(src_stride))    // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
+#endif
+  );
+}
+
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %0,%%xmm3                       \n"
+    "movdqa    %1,%%xmm4                       \n"
+    "movdqa    %2,%%xmm5                       \n"
+  :
+  : "m"(kShuf0),  // %0
+    "m"(kShuf1),  // %1
+    "m"(kShuf2)   // %2
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "palignr   $0x8,%%xmm0,%%xmm1              \n"
+    "pshufb    %%xmm3,%%xmm0                   \n"
+    "pshufb    %%xmm4,%%xmm1                   \n"
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
+    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x18,1) ",%1           \n"
+    "sub       $0x18,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %0,%%xmm2                       \n"  // kShuf01
+    "movdqa    %1,%%xmm3                       \n"  // kShuf11
+    "movdqa    %2,%%xmm4                       \n"  // kShuf21
+  :
+  : "m"(kShuf01),  // %0
+    "m"(kShuf11),  // %1
+    "m"(kShuf21)   // %2
+  );
+  asm volatile (
+    "movdqa    %0,%%xmm5                       \n"  // kMadd01
+    "movdqa    %1,%%xmm0                       \n"  // kMadd11
+    "movdqa    %2,%%xmm1                       \n"  // kRound34
+  :
+  : "m"(kMadd01),  // %0
+    "m"(kMadd11),  // %1
+    "m"(kRound34)  // %2
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
+    MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm5,%%xmm6                   \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS(1) "         \n"
+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
+    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"
+    "pmaddubsw %%xmm0,%%xmm6                   \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3),%%xmm7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm4,%%xmm6                   \n"
+    "pmaddubsw %4,%%xmm6                       \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x18,1) ",%1           \n"
+    "sub       $0x18,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  : "r"((intptr_t)(src_stride)),  // %3
+    "m"(kMadd21)     // %4
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %0,%%xmm2                       \n"  // kShuf01
+    "movdqa    %1,%%xmm3                       \n"  // kShuf11
+    "movdqa    %2,%%xmm4                       \n"  // kShuf21
+  :
+  : "m"(kShuf01),  // %0
+    "m"(kShuf11),  // %1
+    "m"(kShuf21)   // %2
+  );
+  asm volatile (
+    "movdqa    %0,%%xmm5                       \n"  // kMadd01
+    "movdqa    %1,%%xmm0                       \n"  // kMadd11
+    "movdqa    %2,%%xmm1                       \n"  // kRound34
+  :
+  : "m"(kMadd01),  // %0
+    "m"(kMadd11),  // %1
+    "m"(kRound34)  // %2
+  );
+
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
+    MEMOPREG(movdqa,0x00,0,3,1,xmm7)           //  movdqa  (%0,%3,1),%%xmm7
+    "pavgb     %%xmm6,%%xmm7                   \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm2,%%xmm6                   \n"
+    "pmaddubsw %%xmm5,%%xmm6                   \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS(1) "         \n"
+    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
+    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
+    "pavgb     %%xmm6,%%xmm7                   \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"
+    "pmaddubsw %%xmm0,%%xmm6                   \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
+    MEMOPREG(movdqa,0x10,0,3,1,xmm7)           //  movdqa  0x10(%0,%3,1),%%xmm7
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm6,%%xmm7                   \n"
+    "pavgb     %%xmm7,%%xmm6                   \n"
+    "pshufb    %%xmm4,%%xmm6                   \n"
+    "pmaddubsw %4,%%xmm6                       \n"
+    "paddsw    %%xmm1,%%xmm6                   \n"
+    "psrlw     $0x2,%%xmm6                     \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x18,1) ",%1           \n"
+    "sub       $0x18,%2                        \n"
+    "jg        1b                              \n"
+    : "+r"(src_ptr),   // %0
+      "+r"(dst_ptr),   // %1
+      "+r"(dst_width)  // %2
+    : "r"((intptr_t)(src_stride)),  // %3
+      "m"(kMadd21)     // %4
+    : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
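+
+// Rough scalar sketch of the horizontal weights used by the two 3/4 box
+// filters above once the rows have been blended vertically (for reference,
+// not upstream libyuv; weights read off the kShuf/kMadd tables and the
+// kRound34 rounding constant): every 4 source pixels yield 3 output pixels.
+static inline void ScaleRowDown34_Sketch(const uint8* s, uint8* d) {
+  d[0] = (uint8)((s[0] * 3 + s[1] + 2) >> 2);
+  d[1] = (uint8)((s[1] + s[2] + 1) >> 1);
+  d[2] = (uint8)((s[2] + s[3] * 3 + 2) >> 2);
+}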
+
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %3,%%xmm4                       \n"
+    "movdqa    %4,%%xmm5                       \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "paddusb   %%xmm1,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(1) "         \n"
+    "movhlps   %%xmm0,%%xmm1                   \n"
+    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
+    "lea       " MEMLEA(0xc,1) ",%1            \n"
+    "sub       $0xc,%2                         \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),   // %0
+    "+r"(dst_ptr),   // %1
+    "+r"(dst_width)  // %2
+  : "m"(kShuf38a),   // %3
+    "m"(kShuf38b)    // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+      , "xmm0", "xmm1", "xmm4", "xmm5"
+#endif
+  );
+}
+
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %0,%%xmm2                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm4                       \n"
+    "movdqa    %3,%%xmm5                       \n"
+  :
+  : "m"(kShufAb0),   // %0
+    "m"(kShufAb1),   // %1
+    "m"(kShufAb2),   // %2
+    "m"(kScaleAb2)   // %3
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb   (%0,%3,1),%%xmm0
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "pshufb    %%xmm2,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "pshufb    %%xmm3,%%xmm6                   \n"
+    "paddusw   %%xmm6,%%xmm1                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "paddusw   %%xmm0,%%xmm1                   \n"
+    "pmulhuw   %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm1                   \n"
+    "sub       $0x6,%2                         \n"
+    "movd      %%xmm1," MEMACCESS(1) "         \n"
+    "psrlq     $0x10,%%xmm1                    \n"
+    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
+    "lea       " MEMLEA(0x6,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(dst_width)    // %2
+  : "r"((intptr_t)(src_stride))  // %3
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
+
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "movdqa    %0,%%xmm2                       \n"
+    "movdqa    %1,%%xmm3                       \n"
+    "movdqa    %2,%%xmm4                       \n"
+    "pxor      %%xmm5,%%xmm5                   \n"
+  :
+  : "m"(kShufAc),    // %0
+    "m"(kShufAc3),   // %1
+    "m"(kScaleAc33)  // %2
+  );
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqa,0x00,0,3,1,xmm6)           //  movdqa  (%0,%3,1),%%xmm6
+    "movhlps   %%xmm0,%%xmm1                   \n"
+    "movhlps   %%xmm6,%%xmm7                   \n"
+    "punpcklbw %%xmm5,%%xmm0                   \n"
+    "punpcklbw %%xmm5,%%xmm1                   \n"
+    "punpcklbw %%xmm5,%%xmm6                   \n"
+    "punpcklbw %%xmm5,%%xmm7                   \n"
+    "paddusw   %%xmm6,%%xmm0                   \n"
+    "paddusw   %%xmm7,%%xmm1                   \n"
+    MEMOPREG(movdqa,0x00,0,3,2,xmm6)           //  movdqa  (%0,%3,2),%%xmm6
+    "lea       " MEMLEA(0x10,0) ",%0           \n"
+    "movhlps   %%xmm6,%%xmm7                   \n"
+    "punpcklbw %%xmm5,%%xmm6                   \n"
+    "punpcklbw %%xmm5,%%xmm7                   \n"
+    "paddusw   %%xmm6,%%xmm0                   \n"
+    "paddusw   %%xmm7,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm6                   \n"
+    "psrldq    $0x2,%%xmm0                     \n"
+    "paddusw   %%xmm0,%%xmm6                   \n"
+    "psrldq    $0x2,%%xmm0                     \n"
+    "paddusw   %%xmm0,%%xmm6                   \n"
+    "pshufb    %%xmm2,%%xmm6                   \n"
+    "movdqa    %%xmm1,%%xmm7                   \n"
+    "psrldq    $0x2,%%xmm1                     \n"
+    "paddusw   %%xmm1,%%xmm7                   \n"
+    "psrldq    $0x2,%%xmm1                     \n"
+    "paddusw   %%xmm1,%%xmm7                   \n"
+    "pshufb    %%xmm3,%%xmm7                   \n"
+    "paddusw   %%xmm7,%%xmm6                   \n"
+    "pmulhuw   %%xmm4,%%xmm6                   \n"
+    "packuswb  %%xmm6,%%xmm6                   \n"
+    "sub       $0x6,%2                         \n"
+    "movd      %%xmm6," MEMACCESS(1) "         \n"
+    "psrlq     $0x10,%%xmm6                    \n"
+    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
+    "lea       " MEMLEA(0x6,1) ",%1            \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+  );
+}
+
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                       uint16* dst_ptr, int src_width, int src_height) {
+  int tmp_height = 0;
+  intptr_t tmp_src = 0;
+  asm volatile (
+    "pxor      %%xmm4,%%xmm4                   \n"
+    "sub       $0x1,%5                         \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "mov       %0,%3                           \n"
+    "add       %6,%0                           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm4,%%xmm0                   \n"
+    "punpckhbw %%xmm4,%%xmm1                   \n"
+    "mov       %5,%2                           \n"
+    "test      %2,%2                           \n"
+    "je        3f                              \n"
+
+    LABELALIGN
+  "2:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
+    "add       %6,%0                           \n"
+    "movdqa    %%xmm2,%%xmm3                   \n"
+    "punpcklbw %%xmm4,%%xmm2                   \n"
+    "punpckhbw %%xmm4,%%xmm3                   \n"
+    "paddusw   %%xmm2,%%xmm0                   \n"
+    "paddusw   %%xmm3,%%xmm1                   \n"
+    "sub       $0x1,%2                         \n"
+    "jg        2b                              \n"
+
+    LABELALIGN
+  "3:                                          \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
+    "lea       " MEMLEA(0x10,3) ",%0           \n"
+    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "sub       $0x10,%4                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),     // %0
+    "+r"(dst_ptr),     // %1
+    "+r"(tmp_height),  // %2
+    "+r"(tmp_src),     // %3
+    "+r"(src_width),   // %4
+    "+rm"(src_height)  // %5
+  : "rm"((intptr_t)(src_stride))  // %6
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+  );
+}
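+
+// Illustrative scalar equivalent of ScaleAddRows_SSE2 (a sketch for
+// reference, not upstream libyuv): sums src_height rows of bytes column-wise
+// into 16-bit totals. paddusw saturates; this sketch does not.
+static inline void ScaleAddRows_Sketch(const uint8* src_ptr,
+                                       ptrdiff_t src_stride, uint16* dst_ptr,
+                                       int src_width, int src_height) {
+  for (int x = 0; x < src_width; ++x) {
+    uint16 sum = 0;
+    for (int y = 0; y < src_height; ++y) {
+      sum += src_ptr[y * src_stride + x];
+    }
+    dst_ptr[x] = sum;
+  }
+}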
+
+// Bilinear column filtering. SSSE3 version.
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
+  asm volatile (
+    "movd      %6,%%xmm2                       \n"
+    "movd      %7,%%xmm3                       \n"
+    "movl      $0x04040000,%k2                 \n"
+    "movd      %k2,%%xmm5                      \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x9,%%xmm6                     \n"
+    "pextrw    $0x1,%%xmm2,%k3                 \n"
+    "subl      $0x2,%5                         \n"
+    "jl        29f                             \n"
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"
+    "punpckldq %%xmm0,%%xmm2                   \n"
+    "punpckldq %%xmm3,%%xmm3                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"
+    "pextrw    $0x3,%%xmm2,%k4                 \n"
+
+    LABELALIGN
+  "2:                                          \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm3,%%xmm2                   \n"
+    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
+    "movd      %k2,%%xmm0                      \n"
+    "psrlw     $0x9,%%xmm1                     \n"
+    BUNDLEALIGN
+    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
+    "movd      %k2,%%xmm4                      \n"
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "punpcklwd %%xmm4,%%xmm0                   \n"
+    "pxor      %%xmm6,%%xmm1                   \n"
+    "pmaddubsw %%xmm1,%%xmm0                   \n"
+    "pextrw    $0x1,%%xmm2,%k3                 \n"
+    "pextrw    $0x3,%%xmm2,%k4                 \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0,%k2                      \n"
+    "mov       %w2," MEMACCESS(0) "            \n"
+    "lea       " MEMLEA(0x2,0) ",%0            \n"
+    "sub       $0x2,%5                         \n"
+    "jge       2b                              \n"
+
+    LABELALIGN
+  "29:                                         \n"
+    "addl      $0x1,%5                         \n"
+    "jl        99f                             \n"
+    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
+    "movd      %k2,%%xmm0                      \n"
+    "psrlw     $0x9,%%xmm2                     \n"
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "pxor      %%xmm6,%%xmm2                   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0,%k2                      \n"
+    "mov       %b2," MEMACCESS(0) "            \n"
+  "99:                                         \n"
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+a"(temp_pixel),  // %2
+    "+r"(x0),          // %3
+    "+r"(x1),          // %4
+    "+rm"(dst_width)   // %5
+  : "rm"(x),           // %6
+    "rm"(dx)           // %7
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
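+
+// Rough scalar sketch of the bilinear column filter above (for reference,
+// not upstream libyuv; the SSSE3 code works with a 7-bit fraction via
+// psrlw $0x9 / psrlw $0x7, so rounding differs slightly). x and dx are
+// 16.16 fixed-point source positions.
+static inline void ScaleFilterCols_Sketch(uint8* dst_ptr, const uint8* src_ptr,
+                                          int dst_width, int x, int dx) {
+  for (int i = 0; i < dst_width; ++i) {
+    int xi = x >> 16;
+    int f = x & 0xffff;  // fractional part of the source position
+    dst_ptr[i] = (uint8)((src_ptr[xi] * (65536 - f) +
+                          src_ptr[xi + 1] * f) >> 16);
+    x += dx;
+  }
+}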
+
+// Reads 16 pixels, duplicates them and writes 32 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpcklbw %%xmm0,%%xmm0                   \n"
+    "punpckhbw %%xmm1,%%xmm1                   \n"
+    "sub       $0x20,%2                         \n"
+    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "jg        1b                              \n"
+
+  : "+r"(dst_ptr),     // %0
+    "+r"(src_ptr),     // %1
+    "+r"(dst_width)    // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(dst_width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(dst_width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
+    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    BUNDLEALIGN
+    MEMOPREG(movdqa,0x00,0,3,1,xmm2)           //  movdqa   (%0,%3,1),%%xmm2
+    MEMOPREG(movdqa,0x10,0,3,1,xmm3)           //  movdqa   0x10(%0,%3,1),%%xmm3
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),   // %0
+    "+r"(dst_argb),   // %1
+    "+r"(dst_width)   // %2
+  : "r"((intptr_t)(src_stride))   // %3
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+  );
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+  intptr_t src_stepx_x12 = 0;
+  asm volatile (
+    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
+    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
+    LABELALIGN
+  "1:                                          \n"
+    "movd      " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    BUNDLEALIGN
+    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
+    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
+    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
+    "punpckldq %%xmm3,%%xmm2                   \n"
+    "punpcklqdq %%xmm2,%%xmm0                  \n"
+    "sub       $0x4,%3                         \n"
+    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),      // %0
+    "+r"(src_stepx_x4),  // %1
+    "+r"(dst_argb),      // %2
+    "+r"(dst_width),     // %3
+    "+r"(src_stepx_x12)  // %4
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+  );
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride, int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
+  intptr_t src_stepx_x12 = 0;
+  intptr_t row1 = (intptr_t)(src_stride);
+  asm volatile (
+    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
+    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
+    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"
+
+    LABELALIGN
+  "1:                                          \n"
+    "movq      " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
+    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
+    BUNDLEALIGN
+    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
+    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
+    "movq      " MEMACCESS(5) ",%%xmm2         \n"
+    BUNDLEALIGN
+    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
+    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
+    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
+    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "sub       $0x4,%3                         \n"
+    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jg        1b                              \n"
+  : "+r"(src_argb),       // %0
+    "+r"(src_stepx_x4),   // %1
+    "+r"(dst_argb),       // %2
+    "+rm"(dst_width),     // %3
+    "+r"(src_stepx_x12),  // %4
+    "+r"(row1)            // %5
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3"
+#endif
+  );
+}
+
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0;
+  asm volatile (
+    "movd      %5,%%xmm2                       \n"
+    "movd      %6,%%xmm3                       \n"
+    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"
+    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
+    "paddd     %%xmm0,%%xmm2                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"
+    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+    "pextrw    $0x1,%%xmm2,%k0                 \n"
+    "pextrw    $0x3,%%xmm2,%k1                 \n"
+    "cmp       $0x0,%4                         \n"
+    "jl        99f                             \n"
+    "sub       $0x4,%4                         \n"
+    "jl        49f                             \n"
+
+    LABELALIGN
+  "40:                                         \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
+    "pextrw    $0x5,%%xmm2,%k0                 \n"
+    "pextrw    $0x7,%%xmm2,%k1                 \n"
+    "paddd     %%xmm3,%%xmm2                   \n"
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
+    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
+    "pextrw    $0x1,%%xmm2,%k0                 \n"
+    "pextrw    $0x3,%%xmm2,%k1                 \n"
+    "punpckldq %%xmm4,%%xmm1                   \n"
+    "punpcklqdq %%xmm1,%%xmm0                  \n"
+    "sub       $0x4,%4                         \n"
+    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x10,2) ",%2           \n"
+    "jge       40b                             \n"
+
+  "49:                                         \n"
+    "test      $0x2,%4                         \n"
+    "je        29f                             \n"
+    BUNDLEALIGN
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
+    "pextrw    $0x5,%%xmm2,%k0                 \n"
+    "punpckldq %%xmm1,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(2) "         \n"
+    "lea       " MEMLEA(0x8,2) ",%2            \n"
+  "29:                                         \n"
+    "test      $0x1,%4                         \n"
+    "je        99f                             \n"
+    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
+    "movd      %%xmm0," MEMACCESS(2) "         \n"
+  "99:                                         \n"
+  : "+a"(x0),          // %0
+    "+d"(x1),          // %1
+    "+r"(dst_argb),    // %2
+    "+r"(src_argb),    // %3
+    "+r"(dst_width)    // %4
+  : "rm"(x),           // %5
+    "rm"(dx)           // %6
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+  );
+}
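+
+// Illustrative scalar equivalent of ScaleARGBCols_SSE2 (a sketch for
+// reference, not upstream libyuv): nearest-neighbour sampling of 32-bit ARGB
+// pixels with 16.16 fixed-point x / dx.
+static inline void ScaleARGBCols_Sketch(uint8* dst_argb, const uint8* src_argb,
+                                        int dst_width, int x, int dx) {
+  const uint32* src = (const uint32*)(src_argb);
+  uint32* dst = (uint32*)(dst_argb);
+  for (int i = 0; i < dst_width; ++i) {
+    dst[i] = src[x >> 16];  // copy the nearest source pixel
+    x += dx;
+  }
+}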
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  asm volatile (
+    LABELALIGN
+  "1:                                          \n"
+    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
+    "lea       " MEMLEA(0x10,1) ",%1           \n"
+    "movdqa    %%xmm0,%%xmm1                   \n"
+    "punpckldq %%xmm0,%%xmm0                   \n"
+    "punpckhdq %%xmm1,%%xmm1                   \n"
+    "sub       $0x8,%2                         \n"
+    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
+    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
+    "lea       " MEMLEA(0x20,0) ",%0           \n"
+    "jg        1b                              \n"
+
+  : "+r"(dst_argb),    // %0
+    "+r"(src_argb),    // %1
+    "+r"(dst_width)    // %2
+  :
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
+  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+// Bilinear ARGB column filtering. SSSE3 version
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx) {
+  intptr_t x0 = 0, x1 = 0;
+  asm volatile (
+    "movdqa    %0,%%xmm4                       \n"
+    "movdqa    %1,%%xmm5                       \n"
+  :
+  : "m"(kShuffleColARGB),  // %0
+    "m"(kShuffleFractions)  // %1
+  );
+
+  asm volatile (
+    "movd      %5,%%xmm2                       \n"
+    "movd      %6,%%xmm3                       \n"
+    "pcmpeqb   %%xmm6,%%xmm6                   \n"
+    "psrlw     $0x9,%%xmm6                     \n"
+    "pextrw    $0x1,%%xmm2,%k3                 \n"
+    "sub       $0x2,%2                         \n"
+    "jl        29f                             \n"
+    "movdqa    %%xmm2,%%xmm0                   \n"
+    "paddd     %%xmm3,%%xmm0                   \n"
+    "punpckldq %%xmm0,%%xmm2                   \n"
+    "punpckldq %%xmm3,%%xmm3                   \n"
+    "paddd     %%xmm3,%%xmm3                   \n"
+    "pextrw    $0x3,%%xmm2,%k4                 \n"
+
+    LABELALIGN
+  "2:                                          \n"
+    "movdqa    %%xmm2,%%xmm1                   \n"
+    "paddd     %%xmm3,%%xmm2                   \n"
+    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
+    "psrlw     $0x9,%%xmm1                     \n"
+    BUNDLEALIGN
+    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
+    "pshufb    %%xmm5,%%xmm1                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "pxor      %%xmm6,%%xmm1                   \n"
+    "pmaddubsw %%xmm1,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "pextrw    $0x1,%%xmm2,%k3                 \n"
+    "pextrw    $0x3,%%xmm2,%k4                 \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movq      %%xmm0," MEMACCESS(0) "         \n"
+    "lea       " MEMLEA(0x8,0) ",%0            \n"
+    "sub       $0x2,%2                         \n"
+    "jge       2b                              \n"
+
+    LABELALIGN
+  "29:                                         \n"
+    "add       $0x1,%2                         \n"
+    "jl        99f                             \n"
+    "psrlw     $0x9,%%xmm2                     \n"
+    BUNDLEALIGN
+    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
+    "pshufb    %%xmm5,%%xmm2                   \n"
+    "pshufb    %%xmm4,%%xmm0                   \n"
+    "pxor      %%xmm6,%%xmm2                   \n"
+    "pmaddubsw %%xmm2,%%xmm0                   \n"
+    "psrlw     $0x7,%%xmm0                     \n"
+    "packuswb  %%xmm0,%%xmm0                   \n"
+    "movd      %%xmm0," MEMACCESS(0) "         \n"
+
+    LABELALIGN
+  "99:                                         \n"
+  : "+r"(dst_argb),    // %0
+    "+r"(src_argb),    // %1
+    "+rm"(dst_width),  // %2
+    "+r"(x0),          // %3
+    "+r"(x1)           // %4
+  : "rm"(x),           // %5
+    "rm"(dx)           // %6
+  : "memory", "cc"
+#if defined(__native_client__) && defined(__x86_64__)
+    , "r14"
+#endif
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+#endif
+  );
+}
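The SSSE3 filter above keeps only the top 7 bits of the 16.16 fraction (psrlw $0x9) and, through the pxor/pmaddubsw pair, weights the two neighbouring pixels by (127 - f) and f before shifting right by 7. A rough per-channel scalar model (illustrative name, not the library's reference implementation):

#include <stdint.h>

static void ScaleARGBFilterColsSketch(uint8_t* dst_argb,
                                      const uint8_t* src_argb,
                                      int dst_width, int x, int dx) {
  int j, c;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;                   /* integer source position */
    int f = (x >> 9) & 0x7f;            /* 7-bit fraction, as in the asm */
    const uint8_t* p0 = src_argb + 4 * xi;
    const uint8_t* p1 = p0 + 4;
    for (c = 0; c < 4; ++c)             /* blend B, G, R, A independently */
      dst_argb[4 * j + c] =
          (uint8_t)((p0[c] * (127 - f) + p1[c] * f) >> 7);
    x += dx;
  }
}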
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"
+    "shld      $0x10,%%eax,%%edx               \n"
+    "shl       $0x10,%%eax                     \n"
+    "idiv      %1                              \n"
+    "mov       %0, %%eax                       \n"
+    : "+a"(num)  // %0
+    : "c"(div)   // %1
+    : "memory", "cc", "edx"
+  );
+  return num;
+}
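In plain C the same 16.16 division is just a 64-bit widen, shift and divide (a sketch, not the library's own fallback):

#include <stdint.h>

static int FixedDivSketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);  /* 16.16 fixed point quotient */
}

A step such as the dx used by the column scalers can be derived this way, e.g. a source-to-destination width ratio expressed as a 16.16 increment.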
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"
+    "shld      $0x10,%%eax,%%edx               \n"
+    "shl       $0x10,%%eax                     \n"
+    "sub       $0x10001,%%eax                  \n"
+    "sbb       $0x0,%%edx                      \n"
+    "sub       $0x1,%1                         \n"
+    "idiv      %1                              \n"
+    "mov       %0, %%eax                       \n"
+    : "+a"(num)  // %0
+    : "c"(div)   // %1
+    : "memory", "cc", "edx"
+  );
+  return num;
+}
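The variant above subtracts 0x10001 after the shift and divides by div - 1, i.e. roughly ((num - 1) << 16) / (div - 1); a scalar sketch:

#include <stdint.h>

static int FixedDiv1Sketch(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}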
+
+#endif  // defined(__x86_64__) || defined(__i386__)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- /dev/null
+++ b/third_party/libyuv/source/scale_win.cc
@@ -1,0 +1,1320 @@
+/*
+ *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/libyuv/include/libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for Visual C x86.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+// Offsets for source bytes 0 to 9
+static uvec8 kShuf0 =
+  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
+static uvec8 kShuf1 =
+  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf2 =
+  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Offsets for source bytes 0 to 10
+static uvec8 kShuf01 =
+  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+
+// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
+static uvec8 kShuf11 =
+  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+
+// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
+static uvec8 kShuf21 =
+  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+
+// Coefficients for source bytes 0 to 10
+static uvec8 kMadd01 =
+  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+
+// Coefficients for source bytes 10 to 21
+static uvec8 kMadd11 =
+  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+
+// Coefficients for source bytes 21 to 31
+static uvec8 kMadd21 =
+  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+
+// Coefficients for source bytes 21 to 31
+static vec16 kRound34 =
+  { 2, 2, 2, 2, 2, 2, 2, 2 };
+
+static uvec8 kShuf38a =
+  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+static uvec8 kShuf38b =
+  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 0,1,2
+static uvec8 kShufAc =
+  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+
+// Arrange words 0,3,6 into 3,4,5
+static uvec8 kShufAc3 =
+  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x3 and 2x3
+static uvec16 kScaleAc33 =
+  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+
+// Arrange first value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb0 =
+  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+
+// Arrange second value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb1 =
+  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+
+// Arrange third value for pixels 0,1,2,3,4,5
+static uvec8 kShufAb2 =
+  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+
+// Scaling values for boxes of 3x2 and 2x2
+static uvec16 kScaleAb2 =
+  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+
+// Reads 32 pixels, throws half away and writes 16 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8               // isolate odd pixels.
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    ret
+  }
+}
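Per pixel, the psrlw/packuswb pair above simply keeps every second source byte, so the scalar equivalent is (sketch; helper name is illustrative):

#include <stdint.h>

static void ScaleRowDown2Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x)
    dst_ptr[x] = src_ptr[2 * x + 1];  /* odd pixels survive the 8-bit shift */
}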
+
+// Blends 32x1 rectangle to 16x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    ret
+  }
+}
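The column averaging above splits each 16-bit word into its odd byte (psrlw) and even byte (pand with 0x00ff) and averages them with rounding (pavgw). Scalar sketch:

#include <stdint.h>

static void ScaleRowDown2LinearSketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                      int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x)
    dst_ptr[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
}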
+
+// Blends 32x2 rectangle to 16x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + esi]
+    movdqa     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    sub        ecx, 16
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
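The 2x2 box above is built from two rounded averages: pavgb folds the two rows, then the pand/psrlw/pavgw sequence folds adjacent columns. A scalar sketch of that cascaded average (it can differ by one from a single rounded sum (a + b + c + d + 2) >> 2):

#include <stdint.h>

static void ScaleRowDown2BoxSketch(const uint8_t* src_ptr, int src_stride,
                                   uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    int left  = (s[2 * x]     + t[2 * x]     + 1) >> 1;  /* pavgb of the rows */
    int right = (s[2 * x + 1] + t[2 * x + 1] + 1) >> 1;
    dst_ptr[x] = (uint8_t)((left + right + 1) >> 1);     /* pavgw of the columns */
  }
}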
+
+// Reads 32 pixels, throws half away and writes 16 pixels.
+// No alignment requirement; uses unaligned (movdqu) loads and stores.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+
+    align      4
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    psrlw      xmm0, 8               // isolate odd pixels.
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x1 rectangle to 16x1.
+// No alignment requirement; uses unaligned (movdqu) loads and stores.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
+                                        ptrdiff_t src_stride,
+                                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      4
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 16x1.
+// No alignment requirement; uses unaligned (movdqu) loads and stores.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      4
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Point samples 32 pixels to 8 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                        uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
+    psrld      xmm5, 24
+    pslld      xmm5, 16
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    psrlw      xmm0, 8
+    packuswb   xmm0, xmm0
+    sub        ecx, 8
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x4 rectangle to 8x1.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                           uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_ptr
+    mov        esi, [esp + 8 + 8]    // src_stride
+    mov        edx, [esp + 8 + 12]   // dst_ptr
+    mov        ecx, [esp + 8 + 16]   // dst_width
+    lea        edi, [esi + esi * 2]  // src_stride * 3
+    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
+    psrlw      xmm7, 8
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + esi]
+    movdqa     xmm3, [eax + esi + 16]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, [eax + esi * 2]
+    movdqa     xmm3, [eax + esi * 2 + 16]
+    movdqa     xmm4, [eax + edi]
+    movdqa     xmm5, [eax + edi + 16]
+    lea        eax, [eax + 32]
+    pavgb      xmm2, xmm4
+    pavgb      xmm3, xmm5
+    pavgb      xmm0, xmm2
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm7
+    pand       xmm3, xmm7
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
+    psrlw      xmm0, 8
+    pand       xmm2, xmm7
+    pavgw      xmm0, xmm2
+    packuswb   xmm0, xmm0
+
+    sub        ecx, 8
+    movq       qword ptr [edx], xmm0
+    lea        edx, [edx + 8]
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
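Conceptually the routine above reduces each 4x4 block of source bytes to one output: the four rows are folded with cascaded pavgb, then the 32->16->8 column averages do the same horizontally. A scalar sketch using a single rounded mean (the cascaded SIMD averages can differ slightly because each pavg rounds):

#include <stdint.h>

static void ScaleRowDown4BoxSketch(const uint8_t* src_ptr, int src_stride,
                                   uint8_t* dst_ptr, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (i = 0; i < 4; ++i)
      for (j = 0; j < 4; ++j)
        sum += src_ptr[i * src_stride + 4 * x + j];
    dst_ptr[x] = (uint8_t)((sum + 8) >> 4);  /* rounded mean of 16 samples */
  }
}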
+
+// Point samples 32 pixels to 24 pixels.
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    movdqa     xmm3, kShuf0
+    movdqa     xmm4, kShuf1
+    movdqa     xmm5, kShuf2
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm1
+    palignr    xmm1, xmm0, 8
+    pshufb     xmm0, xmm3
+    pshufb     xmm1, xmm4
+    pshufb     xmm2, xmm5
+    movq       qword ptr [edx], xmm0
+    movq       qword ptr [edx + 8], xmm1
+    movq       qword ptr [edx + 16], xmm2
+    lea        edx, [edx + 24]
+    sub        ecx, 24
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 24x1
+// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
+// Then shuffled to do the scaling.
+
+// Register usage:
+// xmm0 src_row 0
+// xmm1 src_row 1
+// xmm2 shuf 0
+// xmm3 shuf 1
+// xmm4 shuf 2
+// xmm5 madd 0
+// xmm6 madd 1
+// xmm7 kRound34
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, kShuf01
+    movdqa     xmm3, kShuf11
+    movdqa     xmm4, kShuf21
+    movdqa     xmm5, kMadd01
+    movdqa     xmm6, kMadd11
+    movdqa     xmm7, kRound34
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]           // pixels 0..7
+    movdqa     xmm1, [eax + esi]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm2
+    pmaddubsw  xmm0, xmm5
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm1, [eax + esi + 8]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm3
+    pmaddubsw  xmm0, xmm6
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 8], xmm0
+    movdqa     xmm0, [eax + 16]      // pixels 16..23
+    movdqa     xmm1, [eax + esi + 16]
+    lea        eax, [eax + 32]
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm4
+    movdqa     xmm1, kMadd21
+    pmaddubsw  xmm0, xmm1
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    sub        ecx, 24
+    movq       qword ptr [edx + 16], xmm0
+    lea        edx, [edx + 24]
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
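The shuffle/madd tables encode the 3/4 filter: after pavgb folds the two rows, every group of 4 source columns becomes 3 outputs weighted (3,1), (2,2) and (1,3), with kRound34 adding 2 before the shift by 2. A scalar sketch of this variant (equal row weights; names are illustrative):

#include <stdint.h>

static void ScaleRowDown34BoxSketch(const uint8_t* src_ptr, int src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  int x, i;
  for (x = 0; x < dst_width; x += 3) {
    uint8_t r[4];
    for (i = 0; i < 4; ++i)
      r[i] = (uint8_t)((s[i] + t[i] + 1) >> 1);           /* pavgb of the rows */
    dst_ptr[0] = (uint8_t)((r[0] * 3 + r[1] * 1 + 2) >> 2);
    dst_ptr[1] = (uint8_t)((r[1] * 2 + r[2] * 2 + 2) >> 2);
    dst_ptr[2] = (uint8_t)((r[2] * 1 + r[3] * 3 + 2) >> 2);
    dst_ptr += 3;
    s += 4;
    t += 4;
  }
}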
+
+// Note that movdqa+palign may be better than movdqu.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, kShuf01
+    movdqa     xmm3, kShuf11
+    movdqa     xmm4, kShuf21
+    movdqa     xmm5, kMadd01
+    movdqa     xmm6, kMadd11
+    movdqa     xmm7, kRound34
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]           // pixels 0..7
+    movdqa     xmm1, [eax + esi]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm2
+    pmaddubsw  xmm0, xmm5
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx], xmm0
+    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm1, [eax + esi + 8]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm3
+    pmaddubsw  xmm0, xmm6
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    movq       qword ptr [edx + 8], xmm0
+    movdqa     xmm0, [eax + 16]      // pixels 16..23
+    movdqa     xmm1, [eax + esi + 16]
+    lea        eax, [eax + 32]
+    pavgb      xmm1, xmm0
+    pavgb      xmm0, xmm1
+    pshufb     xmm0, xmm4
+    movdqa     xmm1, kMadd21
+    pmaddubsw  xmm0, xmm1
+    paddsw     xmm0, xmm7
+    psrlw      xmm0, 2
+    packuswb   xmm0, xmm0
+    sub        ecx, 24
+    movq       qword ptr [edx + 16], xmm0
+    lea        edx, [edx+24]
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+// 3/8 point sampler
+
+// Scale 32 pixels to 12
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
+                          uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    movdqa     xmm4, kShuf38a
+    movdqa     xmm5, kShuf38b
+
+    align      4
+  xloop:
+    movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
+    movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
+    lea        eax, [eax + 32]
+    pshufb     xmm0, xmm4
+    pshufb     xmm1, xmm5
+    paddusb    xmm0, xmm1
+
+    sub        ecx, 12
+    movq       qword ptr [edx], xmm0  // write 12 pixels
+    movhlps    xmm1, xmm0
+    movd       [edx + 8], xmm1
+    lea        edx, [edx + 12]
+    jg         xloop
+
+    ret
+  }
+}
+
+// Scale 16x3 pixels to 6x1 with interpolation
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, kShufAc
+    movdqa     xmm3, kShufAc3
+    movdqa     xmm4, kScaleAc33
+    pxor       xmm5, xmm5
+
+    align      4
+  xloop:
+    movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
+    movdqa     xmm6, [eax + esi]
+    movhlps    xmm1, xmm0
+    movhlps    xmm7, xmm6
+    punpcklbw  xmm0, xmm5
+    punpcklbw  xmm1, xmm5
+    punpcklbw  xmm6, xmm5
+    punpcklbw  xmm7, xmm5
+    paddusw    xmm0, xmm6
+    paddusw    xmm1, xmm7
+    movdqa     xmm6, [eax + esi * 2]
+    lea        eax, [eax + 16]
+    movhlps    xmm7, xmm6
+    punpcklbw  xmm6, xmm5
+    punpcklbw  xmm7, xmm5
+    paddusw    xmm0, xmm6
+    paddusw    xmm1, xmm7
+
+    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
+    psrldq     xmm0, 2
+    paddusw    xmm6, xmm0
+    psrldq     xmm0, 2
+    paddusw    xmm6, xmm0
+    pshufb     xmm6, xmm2
+
+    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
+    psrldq     xmm1, 2
+    paddusw    xmm7, xmm1
+    psrldq     xmm1, 2
+    paddusw    xmm7, xmm1
+    pshufb     xmm7, xmm3
+    paddusw    xmm6, xmm7
+
+    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
+    packuswb   xmm6, xmm6
+
+    sub        ecx, 6
+    movd       [edx], xmm6           // write 6 pixels
+    psrlq      xmm6, 16
+    movd       [edx + 2], xmm6
+    lea        edx, [edx + 6]
+    jg         xloop
+
+    pop        esi
+    ret
+  }
+}
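The pmulhuw by kScaleAc33 is a fixed-point divide: multiplying the 16-bit column sums by 65536/9 (or 65536/6 for the 2-wide column) and keeping the high 16 bits approximates sum/9 (or sum/6). Sketch for one output built from a full 3x3 box (illustrative helper, same slight floor bias as the asm):

#include <stdint.h>

static uint8_t Box3x3Sketch(const uint8_t* src_ptr, int src_stride) {
  unsigned sum = 0;
  int i, j;
  for (i = 0; i < 3; ++i)
    for (j = 0; j < 3; ++j)
      sum += src_ptr[i * src_stride + j];
  return (uint8_t)((sum * (65536 / 9)) >> 16);  /* ~sum / 9, as pmulhuw does */
}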
+
+// Scale 16x2 pixels to 6x1 with interpolation
+__declspec(naked) __declspec(align(16))
+void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    movdqa     xmm2, kShufAb0
+    movdqa     xmm3, kShufAb1
+    movdqa     xmm4, kShufAb2
+    movdqa     xmm5, kScaleAb2
+
+    align      4
+  xloop:
+    movdqa     xmm0, [eax]           // average 2 rows into xmm0
+    pavgb      xmm0, [eax + esi]
+    lea        eax, [eax + 16]
+
+    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
+    pshufb     xmm1, xmm2
+    movdqa     xmm6, xmm0
+    pshufb     xmm6, xmm3
+    paddusw    xmm1, xmm6
+    pshufb     xmm0, xmm4
+    paddusw    xmm1, xmm0
+
+    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
+    packuswb   xmm1, xmm1
+
+    sub        ecx, 6
+    movd       [edx], xmm1           // write 6 pixels
+    psrlq      xmm1, 16
+    movd       [edx + 2], xmm1
+    lea        edx, [edx + 6]
+    jg         xloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Reads 16xN bytes and produces 16 shorts at a time.
+// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
+__declspec(naked) __declspec(align(16))
+void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
+                       uint16* dst_ptr, int src_width,
+                       int src_height) {
+  __asm {
+    push       esi
+    push       edi
+    push       ebx
+    push       ebp
+    mov        esi, [esp + 16 + 4]   // src_ptr
+    mov        edx, [esp + 16 + 8]   // src_stride
+    mov        edi, [esp + 16 + 12]  // dst_ptr
+    mov        ecx, [esp + 16 + 16]  // src_width
+    mov        ebx, [esp + 16 + 20]  // src_height
+    pxor       xmm4, xmm4
+    dec        ebx
+
+    align      4
+  xloop:
+    // first row
+    movdqa     xmm0, [esi]
+    lea        eax, [esi + edx]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm4
+    punpckhbw  xmm1, xmm4
+    lea        esi, [esi + 16]
+    mov        ebp, ebx
+    test       ebp, ebp
+    je         ydone
+
+    // sum remaining rows
+    align      4
+  yloop:
+    movdqa     xmm2, [eax]       // read 16 pixels
+    lea        eax, [eax + edx]  // advance to next row
+    movdqa     xmm3, xmm2
+    punpcklbw  xmm2, xmm4
+    punpckhbw  xmm3, xmm4
+    paddusw    xmm0, xmm2        // sum 16 words
+    paddusw    xmm1, xmm3
+    sub        ebp, 1
+    jg         yloop
+
+    align      4
+  ydone:
+    movdqa     [edi], xmm0
+    movdqa     [edi + 16], xmm1
+    lea        edi, [edi + 32]
+
+    sub        ecx, 16
+    jg         xloop
+
+    pop        ebp
+    pop        ebx
+    pop        edi
+    pop        esi
+    ret
+  }
+}
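In scalar form the accumulation above widens each byte to 16 bits and sums src_height rows column by column; paddusw means the sums saturate at 65535. Sketch (illustrative name):

#include <stdint.h>

static void ScaleAddRowsSketch(const uint8_t* src_ptr, int src_stride,
                               uint16_t* dst_ptr, int src_width,
                               int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    unsigned sum = 0;
    for (y = 0; y < src_height; ++y)
      sum += src_ptr[y * src_stride + x];
    dst_ptr[x] = (uint16_t)(sum < 65535 ? sum : 65535);  /* paddusw saturates */
  }
}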
+
+// Bilinear column filtering. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+// TODO(fbarchard): Switch the following:
+//    xor        ebx, ebx
+//    mov        bx, word ptr [esi + eax]  // 2 source x0 pixels
+// To
+//    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
+// when drmemory bug fixed.
+// https://code.google.com/p/drmemory/issues/detail?id=1396
+
+__declspec(naked) __declspec(align(16))
+void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
+                           int dst_width, int x, int dx) {
+  __asm {
+    push       ebx
+    push       esi
+    push       edi
+    mov        edi, [esp + 12 + 4]    // dst_ptr
+    mov        esi, [esp + 12 + 8]    // src_ptr
+    mov        ecx, [esp + 12 + 12]   // dst_width
+    movd       xmm2, [esp + 12 + 16]  // x
+    movd       xmm3, [esp + 12 + 20]  // dx
+    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
+    movd       xmm5, eax
+    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    psrlw      xmm6, 9
+    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    sub        ecx, 2
+    jl         xloop29
+
+    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    paddd      xmm0, xmm3
+    punpckldq  xmm2, xmm0           // x0 x1
+    punpckldq  xmm3, xmm3           // dx dx
+    paddd      xmm3, xmm3           // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+
+    // 2 Pixel loop.
+    align      4
+  xloop2:
+    movdqa     xmm1, xmm2           // x0, x1 fractions.
+    paddd      xmm2, xmm3           // x += dx
+    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
+    movd       xmm0, ebx
+    psrlw      xmm1, 9              // 7 bit fractions.
+    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
+    movd       xmm4, ebx
+    pshufb     xmm1, xmm5           // 0011
+    punpcklwd  xmm0, xmm4
+    pxor       xmm1, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
+    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
+    movd       ebx, xmm0
+    mov        [edi], bx
+    lea        edi, [edi + 2]
+    sub        ecx, 2               // 2 pixels
+    jge        xloop2
+
+    align      4
+ xloop29:
+
+    add        ecx, 2 - 1
+    jl         xloop99
+
+    // 1 pixel remainder
+    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
+    movd       xmm0, ebx
+    psrlw      xmm2, 9              // 7 bit fractions.
+    pshufb     xmm2, xmm5           // 0011
+    pxor       xmm2, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm2           // 16 bit
+    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0           // 8 bits
+    movd       ebx, xmm0
+    mov        [edi], bl
+
+    align      4
+ xloop99:
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
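This is the planar counterpart of the ARGB column filter: the word load at [esi + eax] fetches src[xi] and src[xi + 1] together, and the 7-bit fraction blends them the same way. Scalar sketch (illustrative name):

#include <stdint.h>

static void ScaleFilterColsSketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                  int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;
    int f = (x >> 9) & 0x7f;  /* psrlw xmm1, 9 keeps a 7-bit fraction */
    dst_ptr[j] =
        (uint8_t)((src_ptr[xi] * (127 - f) + src_ptr[xi + 1] * f) >> 7);
    x += dx;
  }
}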
+
+// Reads 16 pixels, duplicates them and writes 32 pixels.
+// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
+                       int dst_width, int x, int dx) {
+  __asm {
+    mov        edx, [esp + 4]    // dst_ptr
+    mov        eax, [esp + 8]    // src_ptr
+    mov        ecx, [esp + 12]   // dst_width
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    lea        eax,  [eax + 16]
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm0
+    punpckhbw  xmm1, xmm1
+    sub        ecx, 32
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8* dst_argb, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_argb
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_argb
+    mov        ecx, [esp + 16]       // dst_width
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    shufps     xmm0, xmm1, 0xdd
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 8x1 rectangle to 4x1.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_argb
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_argb
+    mov        ecx, [esp + 16]       // dst_width
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    movdqa     xmm2, xmm0
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 8x2 rectangle to 4x1.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8* dst_argb, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_argb
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_argb
+    mov        ecx, [esp + 4 + 16]   // dst_width
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    movdqa     xmm1, [eax + 16]
+    movdqa     xmm2, [eax + esi]
+    movdqa     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8* dst_argb, int dst_width) {
+  __asm {
+    push       ebx
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_argb
+                                     // src_stride ignored
+    mov        ebx, [esp + 8 + 12]   // src_stepx
+    mov        edx, [esp + 8 + 16]   // dst_argb
+    mov        ecx, [esp + 8 + 20]   // dst_width
+    lea        ebx, [ebx * 4]
+    lea        edi, [ebx + ebx * 2]
+
+    align      4
+  wloop:
+    movd       xmm0, [eax]
+    movd       xmm1, [eax + ebx]
+    punpckldq  xmm0, xmm1
+    movd       xmm2, [eax + ebx * 2]
+    movd       xmm3, [eax + edi]
+    lea        eax,  [eax + ebx * 4]
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        edi
+    pop        ebx
+    ret
+  }
+}
+
+// Blends four 2x2 to 4x1.
+// Alignment requirement: dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8* dst_argb, int dst_width) {
+  __asm {
+    push       ebx
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]    // src_argb
+    mov        esi, [esp + 12 + 8]    // src_stride
+    mov        ebx, [esp + 12 + 12]   // src_stepx
+    mov        edx, [esp + 12 + 16]   // dst_argb
+    mov        ecx, [esp + 12 + 20]   // dst_width
+    lea        esi, [eax + esi]       // row1 pointer
+    lea        ebx, [ebx * 4]
+    lea        edi, [ebx + ebx * 2]
+
+    align      4
+  wloop:
+    movq       xmm0, qword ptr [eax]  // row0 4 pairs
+    movhps     xmm0, qword ptr [eax + ebx]
+    movq       xmm1, qword ptr [eax + ebx * 2]
+    movhps     xmm1, qword ptr [eax + edi]
+    lea        eax,  [eax + ebx * 4]
+    movq       xmm2, qword ptr [esi]  // row1 4 pairs
+    movhps     xmm2, qword ptr [esi + ebx]
+    movq       xmm3, qword ptr [esi + ebx * 2]
+    movhps     xmm3, qword ptr [esi + edi]
+    lea        esi,  [esi + ebx * 4]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88      // even pixels
+    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    pavgb      xmm0, xmm2
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
+// Column scaling unfiltered. SSE2 version.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  __asm {
+    push       edi
+    push       esi
+    mov        edi, [esp + 8 + 4]    // dst_argb
+    mov        esi, [esp + 8 + 8]    // src_argb
+    mov        ecx, [esp + 8 + 12]   // dst_width
+    movd       xmm2, [esp + 8 + 16]  // x
+    movd       xmm3, [esp + 8 + 20]  // dx
+
+    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
+    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
+    paddd      xmm2, xmm0
+    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
+    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
+    paddd      xmm2, xmm0            // x3 x2 x1 x0
+    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
+    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
+
+    pextrw     eax, xmm2, 1          // get x0 integer.
+    pextrw     edx, xmm2, 3          // get x1 integer.
+
+    cmp        ecx, 0
+    jle        xloop99
+    sub        ecx, 4
+    jl         xloop49
+
+    // 4 Pixel loop.
+    align      4
+ xloop4:
+    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
+    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
+    pextrw     eax, xmm2, 5           // get x2 integer.
+    pextrw     edx, xmm2, 7           // get x3 integer.
+    paddd      xmm2, xmm3             // x += dx
+    punpckldq  xmm0, xmm1             // x0 x1
+
+    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
+    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
+    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
+    punpckldq  xmm1, xmm4             // x2 x3
+    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
+    sub        ecx, 4                 // 4 pixels
+    movdqu     [edi], xmm0
+    lea        edi, [edi + 16]
+    jge        xloop4
+
+    align      4
+ xloop49:
+    test       ecx, 2
+    je         xloop29
+
+    // 2 Pixels.
+    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
+    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
+    pextrw     eax, xmm2, 5           // get x2 integer.
+    punpckldq  xmm0, xmm1             // x0 x1
+
+    movq       qword ptr [edi], xmm0
+    lea        edi, [edi + 8]
+
+ xloop29:
+    test       ecx, 1
+    je         xloop99
+
+    // 1 Pixel.
+    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
+    movd       dword ptr [edi], xmm0
+    align      4
+ xloop99:
+
+    pop        esi
+    pop        edi
+    ret
+  }
+}
+
+// Bilinear ARGB column filtering combines 2x1 -> 1x1. SSSE3 version.
+// TODO(fbarchard): Port to Neon
+
+// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
+static uvec8 kShuffleColARGB = {
+  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
+  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+};
+
+// Shuffle table for duplicating 2 fractions into 8 bytes each
+static uvec8 kShuffleFractions = {
+  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+};
+
+__declspec(naked) __declspec(align(16))
+void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
+                               int dst_width, int x, int dx) {
+  __asm {
+    push       esi
+    push       edi
+    mov        edi, [esp + 8 + 4]    // dst_argb
+    mov        esi, [esp + 8 + 8]    // src_argb
+    mov        ecx, [esp + 8 + 12]   // dst_width
+    movd       xmm2, [esp + 8 + 16]  // x
+    movd       xmm3, [esp + 8 + 20]  // dx
+    movdqa     xmm4, kShuffleColARGB
+    movdqa     xmm5, kShuffleFractions
+    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    psrlw      xmm6, 9
+    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    sub        ecx, 2
+    jl         xloop29
+
+    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    paddd      xmm0, xmm3
+    punpckldq  xmm2, xmm0           // x0 x1
+    punpckldq  xmm3, xmm3           // dx dx
+    paddd      xmm3, xmm3           // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+
+    // 2 Pixel loop.
+    align      4
+  xloop2:
+    movdqa     xmm1, xmm2           // x0, x1 fractions.
+    paddd      xmm2, xmm3           // x += dx
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    psrlw      xmm1, 9              // 7 bit fractions.
+    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
+    pshufb     xmm1, xmm5           // 0000000011111111
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm1, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
+    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
+    movq       qword ptr [edi], xmm0
+    lea        edi, [edi + 8]
+    sub        ecx, 2               // 2 pixels
+    jge        xloop2
+
+    align      4
+ xloop29:
+
+    add        ecx, 2 - 1
+    jl         xloop99
+
+    // 1 pixel remainder
+    psrlw      xmm2, 9              // 7 bit fractions.
+    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
+    pshufb     xmm2, xmm5           // 00000000
+    pshufb     xmm0, xmm4           // arrange pixels into pairs
+    pxor       xmm2, xmm6           // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
+    psrlw      xmm0, 7
+    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
+    movd       [edi], xmm0
+
+    align      4
+ xloop99:
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+
+// Reads 4 pixels, duplicates them and writes 8 pixels.
+// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
+                           int dst_width, int x, int dx) {
+  __asm {
+    mov        edx, [esp + 4]    // dst_argb
+    mov        eax, [esp + 8]    // src_argb
+    mov        ecx, [esp + 12]   // dst_width
+
+    align      4
+  wloop:
+    movdqa     xmm0, [eax]
+    lea        eax,  [eax + 16]
+    movdqa     xmm1, xmm0
+    punpckldq  xmm0, xmm0
+    punpckhdq  xmm1, xmm1
+    sub        ecx, 8
+    movdqa     [edx], xmm0
+    movdqa     [edx + 16], xmm1
+    lea        edx, [edx + 32]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Divide num by div and return as 16.16 fixed point result.
+__declspec(naked) __declspec(align(16))
+int FixedDiv_X86(int num, int div) {
+  __asm {
+    mov        eax, [esp + 4]    // num
+    cdq                          // extend num to 64 bits
+    shld       edx, eax, 16      // 32.16
+    shl        eax, 16
+    idiv       dword ptr [esp + 8]
+    ret
+  }
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+__declspec(naked) __declspec(align(16))
+int FixedDiv1_X86(int num, int div) {
+  __asm {
+    mov        eax, [esp + 4]    // num
+    mov        ecx, [esp + 8]    // denom
+    cdq                          // extend num to 64 bits
+    shld       edx, eax, 16      // 32.16
+    shl        eax, 16
+    sub        eax, 0x00010001
+    sbb        edx, 0
+    sub        ecx, 1
+    idiv       ecx
+    ret
+  }
+}
+
+#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -873,8 +873,16 @@
         }
 
         if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) {
+#if CONFIG_LIBYUV
           vpx_image_scale(img, scaled_img, kFilterBox);
           img = scaled_img;
+#else
+          fprintf(stderr, "Failed  to scale output frame: %s.\n"
+                  "Scaling is disabled in this configuration. "
+                  "To enable scaling, configure with --enable-libyuv\n",
+                  vpx_codec_error(&decoder));
+          return EXIT_FAILURE;
+#endif
         }
       }
 
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -1268,6 +1268,7 @@
       fprintf(stderr, "%s can only scale 4:2:0 8bpp inputs\n", exec_name);
       exit(EXIT_FAILURE);
     }
+#if CONFIG_LIBYUV
     if (!stream->img)
       stream->img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420,
                                   cfg->g_w, cfg->g_h, 16);
@@ -1283,8 +1284,15 @@
               stream->img->stride[VPX_PLANE_V],
               stream->img->d_w, stream->img->d_h,
               kFilterBox);
-
     img = stream->img;
+#else
+    stream->encoder.err = 1;
+    ctx_exit_on_error(&stream->encoder,
+                      "Stream %d: Failed to encode frame.\n"
+                      "Scaling disabled in this configuration. \n"
+                      "To enable, configure with --enable-libyuv\n",
+                      stream->index);
+#endif
   }
 
   vpx_usec_timer_start(&timer);