shithub: libvpx

ref: 42c7213960bb0a263cbfaf5b220fbbc06d236757
parent: 862d6f48c5be17ee710ecb9e143cb91e54136160
author: Johann <johannkoenig@google.com>
date: Wed Apr 4 07:50:39 EDT 2018

third_party/libyuv: update to a37e7bfe

Fix mingw builds for x86_32 by updating past:
https://chromium.googlesource.com/libyuv/libyuv/+/8fa02df3c0591754958a50

Pick up upstream fixes for clang 5 builds with --disable-optimizations.

Disable libyuv by default when building for msa. We have not been able
to update libyuv because of build issues with mips. This can be
revisited when we update the mips compiler used in Jenkins.

BUG=webm:1509,libyuv:793,webm:1514,webm:1518

Change-Id: Id0b9947cb5e0aa74f2f74746524ab6ff2d48796f

--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -1223,6 +1223,11 @@
         esac
 
         if enabled msa; then
+          # TODO(libyuv:793)
+          # The new mips functions in libyuv do not build
+          # with the toolchains we currently use for testing.
+          soft_disable libyuv
+
           add_cflags -mmsa
           add_asflags -mmsa
           add_ldflags -mmsa
--- a/examples.mk
+++ b/examples.mk
@@ -23,7 +23,7 @@
                 third_party/libyuv/source/row_any.cc \
                 third_party/libyuv/source/row_common.cc \
                 third_party/libyuv/source/row_gcc.cc \
-                third_party/libyuv/source/row_mips.cc \
+                third_party/libyuv/source/row_msa.cc \
                 third_party/libyuv/source/row_neon.cc \
                 third_party/libyuv/source/row_neon64.cc \
                 third_party/libyuv/source/row_win.cc \
@@ -31,7 +31,7 @@
                 third_party/libyuv/source/scale_any.cc \
                 third_party/libyuv/source/scale_common.cc \
                 third_party/libyuv/source/scale_gcc.cc \
-                third_party/libyuv/source/scale_mips.cc \
+                third_party/libyuv/source/scale_msa.cc \
                 third_party/libyuv/source/scale_neon.cc \
                 third_party/libyuv/source/scale_neon64.cc \
                 third_party/libyuv/source/scale_win.cc \
--- /dev/null
+++ b/third_party/libyuv/LICENSE
@@ -1,0 +1,29 @@
+Copyright 2011 The LibYuv Project Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+  * Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+  * Redistributions in binary form must reproduce the above copyright
+    notice, this list of conditions and the following disclaimer in
+    the documentation and/or other materials provided with the
+    distribution.
+
+  * Neither the name of Google nor the names of its contributors may
+    be used to endorse or promote products derived from this software
+    without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/third_party/libyuv/README.libvpx
+++ b/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv
-Version: de944ed8c74909ea6fbd743a22efe1e55e851b83
+Version: a37e7bfece9e0676ae90a1700b0ec85b0f4f22a1
 License: BSD
 License File: LICENSE
 
@@ -8,16 +8,15 @@
 libyuv is an open source project that includes YUV conversion and scaling
 functionality.
 
-The optimized scaler in libyuv is used in multiple resolution encoder example,
-which down-samples the original input video (f.g. 1280x720) a number of times
-in order to encode multiple resolution bit streams.
+The optimized scaler in libyuv is used in the multiple resolution encoder
+example which down-samples the original input video (f.g. 1280x720) a number of
+times in order to encode multiple resolution bit streams.
 
 Local Modifications:
-Disable some functions (webm:1514)
-rm -rf .gitignore .gn AUTHORS Android.mk BUILD.gn CMakeLists.txt DEPS LICENSE \
-  LICENSE_THIRD_PARTY OWNERS PATENTS PRESUBMIT.py README.chromium README.md \
-  all.gyp build_overrides/ chromium/ codereview.settings docs/ \
-  download_vs_toolchain.py gyp_libyuv gyp_libyuv.py include/libyuv.h \
-  include/libyuv/compare_row.h libyuv.gyp libyuv.gypi libyuv_nacl.gyp \
-  libyuv_test.gyp linux.mk public.mk setup_links.py sync_chromium.py \
-  third_party/ tools/ unit_test/ util/ winarm.mk
+rm libyuv/include/libyuv.h libyuv/include/libyuv/compare_row.h
+mv libyuv/include tmp/
+mv libyuv/source tmp/
+mv libyuv/LICENSE tmp/
+rm -rf libyuv
+
+mv tmp/* third_party/libyuv/
--- a/third_party/libyuv/include/libyuv/basic_types.h
+++ b/third_party/libyuv/include/libyuv/basic_types.h
@@ -8,83 +8,37 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_
 #define INCLUDE_LIBYUV_BASIC_TYPES_H_
 
-#include <stddef.h>  // for NULL, size_t
+#include <stddef.h>  // For size_t and NULL
 
+#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG)
+#define INT_TYPES_DEFINED
+
 #if defined(_MSC_VER) && (_MSC_VER < 1600)
 #include <sys/types.h>  // for uintptr_t on x86
+typedef unsigned __int64 uint64_t;
+typedef __int64 int64_t;
+typedef unsigned int uint32_t;
+typedef int int32_t;
+typedef unsigned short uint16_t;
+typedef short int16_t;
+typedef unsigned char uint8_t;
+typedef signed char int8_t;
 #else
-#include <stdint.h>  // for uintptr_t
-#endif
-
-#ifndef GG_LONGLONG
-#ifndef INT_TYPES_DEFINED
-#define INT_TYPES_DEFINED
-#ifdef COMPILER_MSVC
-typedef unsigned __int64 uint64;
-typedef __int64 int64;
-#ifndef INT64_C
-#define INT64_C(x) x ## I64
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## UI64
-#endif
-#define INT64_F "I64"
-#else  // COMPILER_MSVC
-#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long uint64;  // NOLINT
-typedef long int64;  // NOLINT
-#ifndef INT64_C
-#define INT64_C(x) x ## L
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## UL
-#endif
-#define INT64_F "l"
-#else  // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__)
-typedef unsigned long long uint64;  // NOLINT
-typedef long long int64;  // NOLINT
-#ifndef INT64_C
-#define INT64_C(x) x ## LL
-#endif
-#ifndef UINT64_C
-#define UINT64_C(x) x ## ULL
-#endif
-#define INT64_F "ll"
-#endif  // __LP64__
-#endif  // COMPILER_MSVC
-typedef unsigned int uint32;
-typedef int int32;
-typedef unsigned short uint16;  // NOLINT
-typedef short int16;  // NOLINT
-typedef unsigned char uint8;
-typedef signed char int8;
+#include <stdint.h>  // for uintptr_t and C99 types
+#endif               // defined(_MSC_VER) && (_MSC_VER < 1600)
+typedef uint64_t uint64;
+typedef int64_t int64;
+typedef uint32_t uint32;
+typedef int32_t int32;
+typedef uint16_t uint16;
+typedef int16_t int16;
+typedef uint8_t uint8;
+typedef int8_t int8;
 #endif  // INT_TYPES_DEFINED
-#endif  // GG_LONGLONG
 
-// Detect compiler is for x86 or x64.
-#if defined(__x86_64__) || defined(_M_X64) || \
-    defined(__i386__) || defined(_M_IX86)
-#define CPU_X86 1
-#endif
-// Detect compiler is for ARM.
-#if defined(__arm__) || defined(_M_ARM)
-#define CPU_ARM 1
-#endif
-
-#ifndef ALIGNP
-#ifdef __cplusplus
-#define ALIGNP(p, t) \
-    (reinterpret_cast<uint8*>(((reinterpret_cast<uintptr_t>(p) + \
-    ((t) - 1)) & ~((t) - 1))))
-#else
-#define ALIGNP(p, t) \
-    ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1))))  /* NOLINT */
-#endif
-#endif
-
 #if !defined(LIBYUV_API)
 #if defined(_WIN32) || defined(__CYGWIN__)
 #if defined(LIBYUV_BUILDING_SHARED_LIBRARY)
@@ -95,24 +49,17 @@
 #define LIBYUV_API
 #endif  // LIBYUV_BUILDING_SHARED_LIBRARY
 #elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \
-    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \
-    defined(LIBYUV_USING_SHARED_LIBRARY))
-#define LIBYUV_API __attribute__ ((visibility ("default")))
+    (defined(LIBYUV_BUILDING_SHARED_LIBRARY) ||                      \
+     defined(LIBYUV_USING_SHARED_LIBRARY))
+#define LIBYUV_API __attribute__((visibility("default")))
 #else
 #define LIBYUV_API
 #endif  // __GNUC__
 #endif  // LIBYUV_API
 
+// TODO(fbarchard): Remove bool macros.
 #define LIBYUV_BOOL int
 #define LIBYUV_FALSE 0
 #define LIBYUV_TRUE 1
 
-// Visual C x86 or GCC little endian.
-#if defined(__x86_64__) || defined(_M_X64) || \
-  defined(__i386__) || defined(_M_IX86) || \
-  defined(__arm__) || defined(_M_ARM) || \
-  (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#define LIBYUV_LITTLE_ENDIAN
-#endif
-
-#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_BASIC_TYPES_H_
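
The practical effect of the basic_types.h change: libyuv's legacy
integer names (uint8, int64, ...) become plain typedef aliases for the
C99 fixed-width types instead of hand-rolled definitions, so code
written against either spelling keeps compiling. A minimal standalone
sketch of what the shim guarantees (illustrative only, not part of the
patch; the aliases are only defined when INT_TYPES_DEFINED and
GG_LONGLONG are unset, per the guard above):

    #include "libyuv/basic_types.h"

    /* The legacy names now alias the <stdint.h> types, so values and
     * pointers of either spelling are interchangeable at call sites. */
    static void legacy_and_c99_interoperate(void) {
      uint8 legacy = 255;   /* old libyuv spelling */
      uint8_t* p = &legacy; /* C99 spelling; same underlying type */
      (void)p;
    }
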
--- a/third_party/libyuv/include/libyuv/compare.h
+++ b/third_party/libyuv/include/libyuv/compare.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_COMPARE_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_COMPARE_H_
 #define INCLUDE_LIBYUV_COMPARE_H_
 
 #include "libyuv/basic_types.h"
@@ -20,55 +20,88 @@
 
 // Compute a hash for specified memory. Seed of 5381 recommended.
 LIBYUV_API
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed);
 
+// Hamming Distance
+LIBYUV_API
+uint64_t ComputeHammingDistance(const uint8_t* src_a,
+                                const uint8_t* src_b,
+                                int count);
+
 // Scan an opaque argb image and return fourcc based on alpha offset.
 // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
 LIBYUV_API
-uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height);
+uint32_t ARGBDetect(const uint8_t* argb,
+                    int stride_argb,
+                    int width,
+                    int height);
 
 // Sum Square Error - used to compute Mean Square Error or PSNR.
 LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a,
-                             const uint8* src_b, int count);
+uint64_t ComputeSumSquareError(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count);
 
 LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
-                                  const uint8* src_b, int stride_b,
-                                  int width, int height);
+uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
+                                    int stride_a,
+                                    const uint8_t* src_b,
+                                    int stride_b,
+                                    int width,
+                                    int height);
 
 static const int kMaxPsnr = 128;
 
 LIBYUV_API
-double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+double SumSquareErrorToPsnr(uint64_t sse, uint64_t count);
 
 LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height);
+double CalcFramePsnr(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height);
 
 LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height);
+double I420Psnr(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height);
 
 LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height);
+double CalcFrameSsim(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height);
 
 LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height);
+double I420Ssim(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height);
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -75,4 +108,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_COMPARE_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_COMPARE_H_
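
The compare.h changes are mechanical (uint8/uint64 to uint8_t/uint64_t,
one parameter per line) plus the new ComputeHammingDistance entry
point; the call shapes are otherwise unchanged. A hedged sketch of the
usual SSE-to-PSNR composition against the prototypes above (plane_psnr
is a hypothetical helper; buffers are caller-managed):

    #include <stdint.h>
    #include "libyuv/compare.h"

    /* Sum-square-error over count bytes of two equal-size planes,
     * folded into a PSNR value; SumSquareErrorToPsnr caps the result
     * at kMaxPsnr when sse is 0. */
    static double plane_psnr(const uint8_t* a, const uint8_t* b, int count) {
      uint64_t sse = ComputeSumSquareError(a, b, count);
      return SumSquareErrorToPsnr(sse, (uint64_t)count);
    }
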
--- a/third_party/libyuv/include/libyuv/convert.h
+++ b/third_party/libyuv/include/libyuv/convert.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_H_
 #define INCLUDE_LIBYUV_CONVERT_H_
 
 #include "libyuv/basic_types.h"
@@ -16,8 +16,8 @@
 #include "libyuv/rotate.h"  // For enum RotationMode.
 
 // TODO(fbarchard): fix WebRTC source to include following libyuv headers:
-#include "libyuv/convert_argb.h"  // For WebRTC I420ToARGB. b/620
-#include "libyuv/convert_from.h"  // For WebRTC ConvertFromI420. b/620
+#include "libyuv/convert_argb.h"      // For WebRTC I420ToARGB. b/620
+#include "libyuv/convert_from.h"      // For WebRTC ConvertFromI420. b/620
 #include "libyuv/planar_functions.h"  // For WebRTC I420Rect, CopyPlane. b/618
 
 #ifdef __cplusplus
@@ -27,195 +27,335 @@
 
 // Convert I444 to I420.
 LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I444ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert I422 to I420.
 LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I422ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
-// Convert I411 to I420.
-LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
 // Copy I420 to I420.
 #define I420ToI420 I420Copy
 LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
+int I420Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
 
+// Copy I010 to I010
+#define I010ToI010 I010Copy
+#define H010ToH010 I010Copy
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+             int src_stride_y,
+             const uint16_t* src_u,
+             int src_stride_u,
+             const uint16_t* src_v,
+             int src_stride_v,
+             uint16_t* dst_y,
+             int dst_stride_y,
+             uint16_t* dst_u,
+             int dst_stride_u,
+             uint16_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
+
+// Convert 10 bit YUV to 8 bit
+#define H010ToH420 I010ToI420
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
+
 // Convert I400 (grey) to I420.
 LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I400ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 #define J400ToJ420 I400ToI420
 
 // Convert NV12 to I420.
 LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int NV12ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert NV21 to I420.
 LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int NV21ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert YUY2 to I420.
 LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int YUY2ToI420(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert UYVY to I420.
 LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int UYVYToI420(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert M420 to I420.
 LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int M420ToI420(const uint8_t* src_m420,
+               int src_stride_m420,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert Android420 to I420.
 LIBYUV_API
-int Android420ToI420(const uint8* src_y, int src_stride_y,
-                     const uint8* src_u, int src_stride_u,
-                     const uint8* src_v, int src_stride_v,
-                     int pixel_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int width, int height);
+int Android420ToI420(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height);
 
 // ARGB little endian (bgra in memory) to I420.
 LIBYUV_API
-int ARGBToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // BGRA little endian (argb in memory) to I420.
 LIBYUV_API
-int BGRAToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int BGRAToI420(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // ABGR little endian (rgba in memory) to I420.
 LIBYUV_API
-int ABGRToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ABGRToI420(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // RGBA little endian (abgr in memory) to I420.
 LIBYUV_API
-int RGBAToI420(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int RGBAToI420(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // RGB little endian (bgr in memory) to I420.
 LIBYUV_API
-int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height);
+int RGB24ToI420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height);
 
 // RGB big endian (rgb in memory) to I420.
 LIBYUV_API
-int RAWToI420(const uint8* src_frame, int src_stride_frame,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height);
+int RAWToI420(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height);
 
 // RGB16 (RGBP fourcc) little endian to I420.
 LIBYUV_API
-int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
-                 uint8* dst_y, int dst_stride_y,
-                 uint8* dst_u, int dst_stride_u,
-                 uint8* dst_v, int dst_stride_v,
-                 int width, int height);
+int RGB565ToI420(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 uint8_t* dst_u,
+                 int dst_stride_u,
+                 uint8_t* dst_v,
+                 int dst_stride_v,
+                 int width,
+                 int height);
 
 // RGB15 (RGBO fourcc) little endian to I420.
 LIBYUV_API
-int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height);
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height);
 
 // RGB12 (R444 fourcc) little endian to I420.
 LIBYUV_API
-int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height);
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height);
 
 #ifdef HAVE_JPEG
 // src_width/height provided by capture.
 // dst_width/height for clipping determine final size.
 LIBYUV_API
-int MJPGToI420(const uint8* sample, size_t sample_size,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height,
-               int dst_width, int dst_height);
+int MJPGToI420(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
 
 // Query size of MJPG in pixels.
 LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
-             int* width, int* height);
+int MJPGSize(const uint8_t* sample,
+             size_t sample_size,
+             int* width,
+             int* height);
 #endif
 
 // Convert camera sample to I420 with cropping, rotation and vertical flip.
@@ -238,18 +378,25 @@
 //    Must be less than or equal to src_width/src_height
 //    Cropping parameters are pre-rotation.
 // "rotation" can be 0, 90, 180 or 270.
-// "format" is a fourcc. ie 'I420', 'YUY2'
+// "fourcc" is a fourcc. ie 'I420', 'YUY2'
 // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
 LIBYUV_API
-int ConvertToI420(const uint8* src_frame, size_t src_size,
-                  uint8* dst_y, int dst_stride_y,
-                  uint8* dst_u, int dst_stride_u,
-                  uint8* dst_v, int dst_stride_v,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+int ConvertToI420(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_y,
+                  int dst_stride_y,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 format);
+                  uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -256,4 +403,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_H_
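
convert.h follows the same pattern: the uint8_t migration, one
parameter per line, dropped I411 support, and new 10-bit (I010/H010)
entry points. The calling convention for the existing converters is
unchanged. A sketch against the updated ARGBToI420 prototype
(argb_to_i420 is a hypothetical wrapper; strides are in bytes and the
I420 chroma planes are half size in each dimension):

    #include <stdint.h>
    #include "libyuv/convert.h"

    /* Convert a packed width x height ARGB frame (4 bytes per pixel,
     * rows contiguous) into caller-allocated I420 planes. Returns 0 on
     * success, per the prototypes above. */
    static int argb_to_i420(const uint8_t* argb, int width, int height,
                            uint8_t* y, uint8_t* u, uint8_t* v) {
      int half_width = (width + 1) / 2;
      return ARGBToI420(argb, width * 4,
                        y, width,
                        u, half_width,
                        v, half_width,
                        width, height);
    }
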
--- a/third_party/libyuv/include/libyuv/convert_argb.h
+++ b/third_party/libyuv/include/libyuv/convert_argb.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_
 #define INCLUDE_LIBYUV_CONVERT_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -30,95 +30,205 @@
 
 // Copy ARGB to ARGB.
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
 
 // Convert I420 to ARGB.
 LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Duplicate prototype for function in convert_from.h for remoting.
 LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I010 to ARGB.
+LIBYUV_API
+int I010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I010 to ABGR.
+LIBYUV_API
+int I010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert H010 to ABGR.
+LIBYUV_API
+int H010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
 // Convert I422 to ARGB.
 LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert I444 to ARGB.
 LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert J444 to ARGB.
 LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int J444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert I444 to ABGR.
 LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int I444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
-// Convert I411 to ARGB.
-LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
 // Convert I420 with Alpha to preattenuated ARGB.
 LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int attenuate);
+int I420AlphaToARGB(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int attenuate);
 
 // Convert I420 with Alpha to preattenuated ABGR.
 LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_abgr, int dst_stride_abgr,
-                    int width, int height, int attenuate);
+int I420AlphaToABGR(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_abgr,
+                    int dst_stride_abgr,
+                    int width,
+                    int height,
+                    int attenuate);
 
 // Convert I400 (grey) to ARGB.  Reverse of ARGBToI400.
 LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert J400 (jpeg grey) to ARGB.
 LIBYUV_API
-int J400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int J400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Alias.
 #define YToARGB I400ToARGB
@@ -125,117 +235,291 @@
 
 // Convert NV12 to ARGB.
 LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int NV12ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert NV21 to ARGB.
 LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int NV21ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
+// Convert NV12 to ABGR.
+int NV12ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_uv,
+                int src_stride_uv,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
+
 // Convert M420 to ARGB.
 LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int M420ToARGB(const uint8_t* src_m420,
+               int src_stride_m420,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert YUY2 to ARGB.
 LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int YUY2ToARGB(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert UYVY to ARGB.
 LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int UYVYToARGB(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert J420 to ARGB.
 LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int J420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert J422 to ARGB.
 LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int J422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert J420 to ABGR.
 LIBYUV_API
-int J420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int J420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert J422 to ABGR.
 LIBYUV_API
-int J422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int J422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert H420 to ARGB.
 LIBYUV_API
-int H420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int H420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert H422 to ARGB.
 LIBYUV_API
-int H422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int H422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert H420 to ABGR.
 LIBYUV_API
-int H420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int H420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert H422 to ABGR.
 LIBYUV_API
-int H422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int H422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
+// Convert H010 to ARGB.
+LIBYUV_API
+int H010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+
 // BGRA little endian (argb in memory) to ARGB.
 LIBYUV_API
-int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int BGRAToARGB(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // ABGR little endian (rgba in memory) to ARGB.
 LIBYUV_API
-int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int ABGRToARGB(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // RGBA little endian (abgr in memory) to ARGB.
 LIBYUV_API
-int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int RGBAToARGB(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Deprecated function name.
 #define BG24ToARGB RGB24ToARGB
@@ -242,46 +526,125 @@
 
 // RGB little endian (bgr in memory) to ARGB.
 LIBYUV_API
-int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height);
+int RGB24ToARGB(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height);
 
 // RGB big endian (rgb in memory) to ARGB.
 LIBYUV_API
-int RAWToARGB(const uint8* src_frame, int src_stride_frame,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
+int RAWToARGB(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
 
 // RGB16 (RGBP fourcc) little endian to ARGB.
 LIBYUV_API
-int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
+int RGB565ToARGB(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
 
 // RGB15 (RGBO fourcc) little endian to ARGB.
 LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height);
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height);
 
 // RGB12 (R444 fourcc) little endian to ARGB.
 LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height);
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height);
 
+// Aliases
+#define AB30ToARGB AR30ToABGR
+#define AB30ToABGR AR30ToARGB
+#define AB30ToAR30 AR30ToAB30
+
+// Convert AR30 To ARGB.
+LIBYUV_API
+int AR30ToARGB(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
+
+// Convert AR30 To ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
+
+// Convert AR30 To AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height);
+
 #ifdef HAVE_JPEG
 // src_width/height provided by capture
 // dst_width/height for clipping determine final size.
 LIBYUV_API
-int MJPGToARGB(const uint8* sample, size_t sample_size,
-               uint8* dst_argb, int dst_stride_argb,
-               int src_width, int src_height,
-               int dst_width, int dst_height);
+int MJPGToARGB(const uint8_t* sample,
+               size_t sample_size,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height);
 #endif
 
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height);
+
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_abgr,
+                     int dst_stride_abgr,
+                     int width,
+                     int height);
+
 // Convert camera sample to ARGB with cropping, rotation and vertical flip.
-// "src_size" is needed to parse MJPG.
+// "sample_size" is needed to parse MJPG.
 // "dst_stride_argb" number of bytes in a row of the dst_argb plane.
 //   Normally this would be the same as dst_width, with recommended alignment
 //   to 16 bytes for better efficiency.
@@ -300,16 +663,21 @@
 //    Must be less than or equal to src_width/src_height
 //    Cropping parameters are pre-rotation.
 // "rotation" can be 0, 90, 180 or 270.
-// "format" is a fourcc. ie 'I420', 'YUY2'
+// "fourcc" is a fourcc. ie 'I420', 'YUY2'
 // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
 LIBYUV_API
-int ConvertToARGB(const uint8* src_frame, size_t src_size,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+int ConvertToARGB(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 format);
+                  uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -316,4 +684,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_ARGB_H_
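
As a usage sketch for the renamed ConvertToARGB parameters (the
ConvertCameraFrame wrapper, the YUY2 source format and the packed-row
strides are illustrative assumptions, not part of this patch):

#include "libyuv/convert_argb.h"   // ConvertToARGB
#include "libyuv/rotate.h"         // enum RotationMode
#include "libyuv/video_common.h"   // FOURCC_YUY2

// Hypothetical helper: convert a full YUY2 camera frame to ARGB with no
// cropping or rotation. YUY2 packs two pixels into four bytes, so
// sample_size is width * height * 2; ARGB output is 4 bytes per pixel.
int ConvertCameraFrame(const uint8_t* sample, int width, int height,
                       uint8_t* dst_argb) {
  return libyuv::ConvertToARGB(sample, (size_t)width * height * 2,
                               dst_argb, width * 4,  // dst_stride_argb
                               0, 0,                 // crop_x, crop_y
                               width, height,        // src_width, src_height
                               width, height,        // crop_width, crop_height
                               libyuv::kRotate0,
                               libyuv::FOURCC_YUY2);
}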
--- a/third_party/libyuv/include/libyuv/convert_from.h
+++ b/third_party/libyuv/include/libyuv/convert_from.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_
 #define INCLUDE_LIBYUV_CONVERT_FROM_H_
 
 #include "libyuv/basic_types.h"
@@ -21,155 +21,318 @@
 
 // See Also convert.h for conversions from formats to I420.
 
-// I420Copy in convert to I420ToI420.
+// Convert 8 bit YUV to 10 bit.
+#define H420ToH010 I420ToI010
+int I420ToI010(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I420ToI422(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I420ToI444(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
 LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I400Copy(const uint8_t* src_y,
+             int src_stride_y,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             int width,
+             int height);
 
-// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
 LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
-             uint8* dst_y, int dst_stride_y,
-             int width, int height);
+int I420ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
+int I420ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
+int I420ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
+int I420ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I420ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int I420ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
 
 LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
+int I420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
 
 LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_frame, int dst_stride_frame,
-                int width, int height);
+int I420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
 
 LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              uint8* dst_frame, int dst_stride_frame,
-              int width, int height);
+int H420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
 
 LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_u, int src_stride_u,
-                 const uint8* src_v, int src_stride_v,
-                 uint8* dst_frame, int dst_stride_frame,
-                 int width, int height);
+int H420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
 
+LIBYUV_API
+int I420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
+
 // Convert I420 To RGB565 with 4x4 dither matrix (16 bytes).
 // Values in dither matrix from 0 to 7 recommended.
 // The order of the dither matrix is first byte is upper left.
 
 LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint8* dst_frame, int dst_stride_frame,
-                       const uint8* dither4x4, int width, int height);
+int I420ToRGB565Dither(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height);
 
 LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_frame, int dst_stride_frame,
-                   int width, int height);
+int I420ToARGB1555(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height);
 
 LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_frame, int dst_stride_frame,
-                   int width, int height);
+int I420ToARGB4444(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height);
 
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
 // Convert I420 to specified format.
 // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
 //    buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
 LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
-                    const uint8* u, int u_stride,
-                    const uint8* v, int v_stride,
-                    uint8* dst_sample, int dst_sample_stride,
-                    int width, int height,
-                    uint32 format);
+int ConvertFromI420(const uint8_t* y,
+                    int y_stride,
+                    const uint8_t* u,
+                    int u_stride,
+                    const uint8_t* v,
+                    int v_stride,
+                    uint8_t* dst_sample,
+                    int dst_sample_stride,
+                    int width,
+                    int height,
+                    uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -176,4 +339,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_H_
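
A short sketch of the new 8-bit to 10-bit path declared above (the
PromoteTo10Bit wrapper and the tightly packed, even-width planes are
illustrative assumptions; strides for the uint16_t planes count
elements, not bytes):

#include "libyuv/convert_from.h"

// Illustrative: promote packed 8-bit I420 to 10-bit I010. Chroma is
// subsampled 2x2, so the U and V strides are half the luma stride.
int PromoteTo10Bit(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint16_t* y10, uint16_t* u10, uint16_t* v10,
                   int width, int height) {
  const int half = width / 2;
  return libyuv::I420ToI010(y, width, u, half, v, half,
                            y10, width, u10, half, v10, half,
                            width, height);
}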
--- a/third_party/libyuv/include/libyuv/convert_from_argb.h
+++ b/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
 #define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -21,166 +21,263 @@
 // Copy ARGB to ARGB.
 #define ARGBToARGB ARGBCopy
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
 
 // Convert ARGB To BGRA.
 LIBYUV_API
-int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height);
+int ARGBToBGRA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
 
 // Convert ARGB To ABGR.
 LIBYUV_API
-int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int ARGBToABGR(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert ARGB To RGBA.
 LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
+int ARGBToRGBA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
 
+// Aliases
+#define ARGBToAB30 ABGRToAR30
+#define ABGRToAB30 ARGBToAR30
+
+// Convert ABGR To AR30.
+LIBYUV_API
+int ABGRToAR30(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
+// Convert ARGB To AR30.
+LIBYUV_API
+int ARGBToAR30(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height);
+
 // Convert ARGB To RGB24.
 LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height);
+int ARGBToRGB24(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height);
 
 // Convert ARGB To RAW.
 LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_rgb, int dst_stride_rgb,
-              int width, int height);
+int ARGBToRAW(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height);
 
 // Convert ARGB To RGB565.
 LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
+int ARGBToRGB565(const uint8_t* src_argb,
+                 int src_stride_argb,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
 
 // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
 // Values in dither matrix from 0 to 7 recommended.
 // The order of the dither matrix is first byte is upper left.
 // TODO(fbarchard): Consider pointer to 2d array for dither4x4.
-// const uint8(*dither)[4][4];
+// const uint8_t(*dither)[4][4];
 LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height);
+int ARGBToRGB565Dither(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height);
 
 // Convert ARGB To ARGB1555.
 LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height);
+int ARGBToARGB1555(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height);
 
 // Convert ARGB To ARGB4444.
 LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height);
+int ARGBToARGB4444(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height);
 
 // Convert ARGB To I444.
 LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI444(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB To I422.
 LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB To I420. (also in convert.h)
 LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB to J420. (JPeg full range I420).
 LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToJ420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Convert ARGB to J422.
 LIBYUV_API
-int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int ARGBToJ422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
-// Convert ARGB To I411.
-LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
-
 // Convert ARGB to J400. (JPeg full range).
 LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               int width, int height);
+int ARGBToJ400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height);
 
 // Convert ARGB to I400.
 LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+int ARGBToI400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
 
 // Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
 LIBYUV_API
-int ARGBToG(const uint8* src_argb, int src_stride_argb,
-            uint8* dst_g, int dst_stride_g,
-            int width, int height);
+int ARGBToG(const uint8_t* src_argb,
+            int src_stride_argb,
+            uint8_t* dst_g,
+            int dst_stride_g,
+            int width,
+            int height);
 
 // Convert ARGB To NV12.
 LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
+int ARGBToNV12(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
 
 // Convert ARGB To NV21.
 LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
+int ARGBToNV21(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 
 // Convert ARGB To NV21.
 LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height);
+int ARGBToNV21(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
 
 // Convert ARGB To YUY2.
 LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height);
+int ARGBToYUY2(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
 
 // Convert ARGB To UYVY.
 LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height);
+int ARGBToUYVY(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -187,4 +284,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
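
To make the dither-matrix comment above concrete, a small sketch
(kDither4x4 and the ToRGB565Dithered wrapper are illustrative, not part
of the patch):

#include "libyuv/convert_from_argb.h"

// A 4x4 ordered-dither matrix using the recommended 0..7 values; the
// first byte applies to the upper-left pixel of each 4x4 tile.
static const uint8_t kDither4x4[16] = {
    0, 4, 1, 5,
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
};

// Illustrative wrapper: ARGB (4 bytes/pixel) to RGB565 (2 bytes/pixel)
// with dithering, assuming tightly packed rows.
int ToRGB565Dithered(const uint8_t* src_argb, uint8_t* dst_rgb565,
                     int width, int height) {
  return libyuv::ARGBToRGB565Dither(src_argb, width * 4,
                                    dst_rgb565, width * 2,
                                    kDither4x4, width, height);
}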
--- a/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/third_party/libyuv/include/libyuv/cpu_id.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_CPU_ID_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_CPU_ID_H_
 #define INCLUDE_LIBYUV_CPU_ID_H_
 
 #include "libyuv/basic_types.h"
@@ -31,46 +31,85 @@
 static const int kCpuHasSSE2 = 0x20;
 static const int kCpuHasSSSE3 = 0x40;
 static const int kCpuHasSSE41 = 0x80;
-static const int kCpuHasSSE42 = 0x100;
+static const int kCpuHasSSE42 = 0x100;  // unused at this time.
 static const int kCpuHasAVX = 0x200;
 static const int kCpuHasAVX2 = 0x400;
 static const int kCpuHasERMS = 0x800;
 static const int kCpuHasFMA3 = 0x1000;
-static const int kCpuHasAVX3 = 0x2000;
-// 0x2000, 0x4000, 0x8000 reserved for future X86 flags.
+static const int kCpuHasF16C = 0x2000;
+static const int kCpuHasGFNI = 0x4000;
+static const int kCpuHasAVX512BW = 0x8000;
+static const int kCpuHasAVX512VL = 0x10000;
+static const int kCpuHasAVX512VBMI = 0x20000;
+static const int kCpuHasAVX512VBMI2 = 0x40000;
+static const int kCpuHasAVX512VBITALG = 0x80000;
+static const int kCpuHasAVX512VPOPCNTDQ = 0x100000;
 
 // These flags are only valid on MIPS processors.
-static const int kCpuHasMIPS = 0x10000;
-static const int kCpuHasDSPR2 = 0x20000;
+static const int kCpuHasMIPS = 0x200000;
+static const int kCpuHasMSA = 0x400000;
 
-// Internal function used to auto-init.
+// Optional init function. TestCpuFlag does an auto-init.
+// Returns cpu_info flags.
 LIBYUV_API
 int InitCpuFlags(void);
 
-// Internal function for parsing /proc/cpuinfo.
-LIBYUV_API
-int ArmCpuCaps(const char* cpuinfo_name);
-
 // Detect CPU has SSE2 etc.
 // Test_flag parameter should be one of kCpuHas constants above.
-// returns non-zero if instruction set is detected
+// Returns non-zero if instruction set is detected
 static __inline int TestCpuFlag(int test_flag) {
   LIBYUV_API extern int cpu_info_;
-  return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag;
+#ifdef __ATOMIC_RELAXED
+  int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED);
+#else
+  int cpu_info = cpu_info_;
+#endif
+  return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag;
 }
 
+// Internal function for parsing /proc/cpuinfo.
+LIBYUV_API
+int ArmCpuCaps(const char* cpuinfo_name);
+
 // For testing, allow CPU flags to be disabled.
 // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
 // MaskCpuFlags(-1) to enable all cpu specific optimizations.
 // MaskCpuFlags(1) to disable all cpu specific optimizations.
+// MaskCpuFlags(0) to reset state so next call will auto init.
+// Returns cpu_info flags.
 LIBYUV_API
-void MaskCpuFlags(int enable_flags);
+int MaskCpuFlags(int enable_flags);
 
+// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags|
+// should be a valid combination of the kCpuHas constants above and include
+// kCpuInitialized. Use this method when running in a sandboxed process where
+// the detection code might fail (as it might access /proc/cpuinfo). In such
+// cases the cpu_info can be obtained from a non sandboxed process by calling
+// InitCpuFlags() and passed to the sandboxed process (via command line
+// parameters, IPC...) which can then call this method to initialize the CPU
+// flags.
+// Notes:
+// - when specifying 0 for |cpu_flags|, the auto initialization is enabled
+//   again.
+// - enabling CPU features that are not supported by the CPU will result in
+//   undefined behavior.
+// TODO(fbarchard): consider writing a helper function that translates from
+// other library CPU info to libyuv CPU info and add a .md doc that explains
+// CPU detection.
+static __inline void SetCpuFlags(int cpu_flags) {
+  LIBYUV_API extern int cpu_info_;
+#ifdef __ATOMIC_RELAXED
+  __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED);
+#else
+  cpu_info_ = cpu_flags;
+#endif
+}
+
 // Low level cpuid for X86. Returns zeros on other CPUs.
 // eax is the info type that you want.
 // ecx is typically the cpu number, and should normally be zero.
 LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
+void CpuId(int info_eax, int info_ecx, int* cpu_info);
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -77,4 +116,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_CPU_ID_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_CPU_ID_H_
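
The detection and sandbox flow described in the comments above, sketched
end to end (the CpuFlagExample wrapper is illustrative):

#include "libyuv/cpu_id.h"

// Illustrative: runtime dispatch plus the sandbox hand-off described in
// the SetCpuFlags comment.
void CpuFlagExample(void) {
  // Auto-initializing query: take an MSA path only when available.
  if (libyuv::TestCpuFlag(libyuv::kCpuHasMSA)) {
    // ... call MSA-accelerated code ...
  }

  // Non-sandboxed process: detect once, ship the value over IPC.
  int cpu_info = libyuv::InitCpuFlags();
  // Sandboxed process: install the flags without touching /proc/cpuinfo.
  libyuv::SetCpuFlags(cpu_info);

  // Testing: mask out SSSE3 paths, then restore auto-init.
  libyuv::MaskCpuFlags(~libyuv::kCpuHasSSSE3);
  libyuv::MaskCpuFlags(0);
}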
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/macros_msa.h
@@ -1,0 +1,233 @@
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
+#define INCLUDE_LIBYUV_MACROS_MSA_H_
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include <msa.h>
+#include <stdint.h>
+
+#if (__mips_isa_rev >= 6)
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t* psrc_lw_m = (const uint8_t*)(psrc);  \
+    uint32_t val_m;                                     \
+    asm volatile("lw  %[val_m],  %[psrc_lw_m]  \n"      \
+                 : [val_m] "=r"(val_m)                  \
+                 : [psrc_lw_m] "m"(*psrc_lw_m));        \
+    val_m;                                              \
+  })
+
+#if (__mips == 64)
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);  \
+    uint64_t val_m = 0;                                 \
+    asm volatile("ld  %[val_m],  %[psrc_ld_m]  \n"      \
+                 : [val_m] "=r"(val_m)                  \
+                 : [psrc_ld_m] "m"(*psrc_ld_m));        \
+    val_m;                                              \
+  })
+#else  // !(__mips == 64)
+#define LD(psrc)                                                         \
+  ({                                                                     \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
+    uint32_t val0_m, val1_m;                                             \
+    uint64_t val_m = 0;                                                  \
+    val0_m = LW(psrc_ld_m);                                              \
+    val1_m = LW(psrc_ld_m + 4);                                          \
+    val_m = (uint64_t)(val1_m);                             /* NOLINT */ \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           /* NOLINT */ \
+    val_m;                                                               \
+  })
+#endif  // (__mips == 64)
+
+#define SW(val, pdst)                                   \
+  ({                                                    \
+    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
+    uint32_t val_m = (val);                             \
+    asm volatile("sw  %[val_m],  %[pdst_sw_m]  \n"      \
+                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
+                 : [val_m] "r"(val_m));                 \
+  })
+
+#if (__mips == 64)
+#define SD(val, pdst)                                   \
+  ({                                                    \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \
+    uint64_t val_m = (val);                             \
+    asm volatile("sd  %[val_m],  %[pdst_sd_m]  \n"      \
+                 : [pdst_sd_m] "=m"(*pdst_sd_m)         \
+                 : [val_m] "r"(val_m));                 \
+  })
+#else  // !(__mips == 64)
+#define SD(val, pdst)                                        \
+  ({                                                         \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
+    uint32_t val0_m, val1_m;                                 \
+    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
+    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+    SW(val0_m, pdst_sd_m);                                   \
+    SW(val1_m, pdst_sd_m + 4);                               \
+  })
+#endif  // !(__mips == 64)
+#else   // !(__mips_isa_rev >= 6)
+#define LW(psrc)                                        \
+  ({                                                    \
+    const uint8_t* psrc_lw_m = (const uint8_t*)(psrc);  \
+    uint32_t val_m;                                     \
+    asm volatile("ulw  %[val_m],  %[psrc_lw_m]  \n"     \
+                 : [val_m] "=r"(val_m)                  \
+                 : [psrc_lw_m] "m"(*psrc_lw_m));        \
+    val_m;                                              \
+  })
+
+#if (__mips == 64)
+#define LD(psrc)                                        \
+  ({                                                    \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);  \
+    uint64_t val_m = 0;                                 \
+    asm volatile("uld  %[val_m],  %[psrc_ld_m]  \n"     \
+                 : [val_m] "=r"(val_m)                  \
+                 : [psrc_ld_m] "m"(*psrc_ld_m));        \
+    val_m;                                              \
+  })
+#else  // !(__mips == 64)
+#define LD(psrc)                                                         \
+  ({                                                                     \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc);                   \
+    uint32_t val0_m, val1_m;                                             \
+    uint64_t val_m = 0;                                                  \
+    val0_m = LW(psrc_ld_m);                                              \
+    val1_m = LW(psrc_ld_m + 4);                                          \
+    val_m = (uint64_t)(val1_m);                             /* NOLINT */ \
+    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \
+    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           /* NOLINT */ \
+    val_m;                                                               \
+  })
+#endif  // (__mips == 64)
+
+#define SW(val, pdst)                                   \
+  ({                                                    \
+    uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \
+    uint32_t val_m = (val);                             \
+    asm volatile("usw  %[val_m],  %[pdst_sw_m]  \n"     \
+                 : [pdst_sw_m] "=m"(*pdst_sw_m)         \
+                 : [val_m] "r"(val_m));                 \
+  })
+
+#define SD(val, pdst)                                        \
+  ({                                                         \
+    uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */      \
+    uint32_t val0_m, val1_m;                                 \
+    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
+    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+    SW(val0_m, pdst_sd_m);                                   \
+    SW(val1_m, pdst_sd_m + 4);                               \
+  })
+#endif  // (__mips_isa_rev >= 6)
+
+// TODO(fbarchard): Consider removing __VAR_ARGS versions.
+#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */
+#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */
+#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
+
+/* Description : Load two vectors with 16 'byte' sized elements
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Load 16 byte elements in 'out0' from (psrc)
+                 Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1) \
+  {                                            \
+    out0 = LD_B(RTYPE, (psrc));                \
+    out1 = LD_B(RTYPE, (psrc) + stride);       \
+  }
+#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
+  {                                                        \
+    LD_B2(RTYPE, (psrc), stride, out0, out1);              \
+    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
+  }
+#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__)
+
+/* Description : Store two vectors with stride each having 16 'byte' sized
+                 elements
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) \
+  {                                          \
+    ST_B(RTYPE, in0, (pdst));                \
+    ST_B(RTYPE, in1, (pdst) + stride);       \
+  }
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
+  {                                                      \
+    ST_B2(RTYPE, in0, in1, (pdst), stride);              \
+    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
+  }
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) \
+  {                                          \
+    ST_H(RTYPE, in0, (pdst));                \
+    ST_H(RTYPE, in1, (pdst) + stride);       \
+  }
+#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
+
+// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly.
+/* Description : Shuffle byte vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
+  {                                                                   \
+    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
+    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
+  }
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements from 'in0' and 'in1' are
+                 interleaved and written to 'out0'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
+  {                                                     \
+    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+  }
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+
+#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
+
+#endif  // INCLUDE_LIBYUV_MACROS_MSA_H_
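
For readers without a MIPS toolchain, a plain-C equivalent of the
32-bit LD() path above (an illustration, not part of the header):

#include <stdint.h>
#include <string.h>

// Mirrors the mips32 LD() macro: two unaligned 32-bit loads stitched
// into one 64-bit value, low word from the lower address (little
// endian, as on the MSA targets this header serves).
static uint64_t LoadU64(const uint8_t* psrc) {
  uint32_t val0, val1;
  memcpy(&val0, psrc, 4);      // LW(psrc)
  memcpy(&val1, psrc + 4, 4);  // LW(psrc + 4)
  return ((uint64_t)val1 << 32) | val0;
}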
--- a/third_party/libyuv/include/libyuv/mjpeg_decoder.h
+++ b/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_
 #define INCLUDE_LIBYUV_MJPEG_DECODER_H_
 
 #include "libyuv/basic_types.h"
@@ -26,18 +26,17 @@
 extern "C" {
 #endif
 
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
+LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+static const uint32_t kUnknownDataSize = 0xFFFFFFFF;
 
 enum JpegSubsamplingType {
   kJpegYuv420,
   kJpegYuv422,
-  kJpegYuv411,
   kJpegYuv444,
   kJpegYuv400,
   kJpegUnknown
@@ -44,7 +43,7 @@
 };
 
 struct Buffer {
-  const uint8* data;
+  const uint8_t* data;
   int len;
 };
 
@@ -66,7 +65,7 @@
 class LIBYUV_API MJpegDecoder {
  public:
   typedef void (*CallbackFunction)(void* opaque,
-                                   const uint8* const* data,
+                                   const uint8_t* const* data,
                                    const int* strides,
                                    int rows);
 
@@ -86,7 +85,7 @@
   // If return value is LIBYUV_TRUE, then the values for all the following
   // getters are populated.
   // src_len is the size of the compressed mjpeg frame in bytes.
-  LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
+  LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len);
 
   // Returns width of the last loaded frame in pixels.
   int GetWidth();
@@ -139,18 +138,22 @@
   // at least GetComponentSize(i). The pointers in planes are incremented
   // to point to after the end of the written data.
   // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
-  LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+  LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height);
 
   // Decodes the entire image and passes the data via repeated calls to a
   // callback function. Each call will get the data for a whole number of
   // image scanlines.
   // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
-  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
-                        int dst_width, int dst_height);
+  LIBYUV_BOOL DecodeToCallback(CallbackFunction fn,
+                               void* opaque,
+                               int dst_width,
+                               int dst_height);
 
   // The helper function which recognizes the jpeg sub-sampling type.
   static JpegSubsamplingType JpegSubsamplingTypeHelper(
-     int* subsample_x, int* subsample_y, int number_of_components);
+      int* subsample_x,
+      int* subsample_y,
+      int number_of_components);
 
  private:
   void AllocOutputBuffers(int num_outbufs);
@@ -159,7 +162,7 @@
   LIBYUV_BOOL StartDecode();
   LIBYUV_BOOL FinishDecode();
 
-  void SetScanlinePointers(uint8** data);
+  void SetScanlinePointers(uint8_t** data);
   LIBYUV_BOOL DecodeImcuRow();
 
   int GetComponentScanlinePadding(int component);
@@ -178,11 +181,11 @@
 
   // Temporaries used to point to scanline outputs.
   int num_outbufs_;  // Outermost size of all arrays below.
-  uint8*** scanlines_;
+  uint8_t*** scanlines_;
   int* scanlines_sizes_;
   // Temporary buffer used for decoding when we can't decode directly to the
   // output buffers. Large enough for just one iMCU row.
-  uint8** databuf_;
+  uint8_t** databuf_;
   int* databuf_strides_;
 };
 
@@ -189,4 +192,4 @@
 }  // namespace libyuv
 
 #endif  //  __cplusplus
-#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_MJPEG_DECODER_H_
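
A brief usage sketch for the reworked decoder interface (assumes a
build with JPEG support; the DecodeMjpg wrapper and the three-plane
layout are illustrative):

#include "libyuv/mjpeg_decoder.h"

// Illustrative: decode one MJPG frame into caller-provided planes,
// each at least GetComponentSize(i) bytes as the header requires.
bool DecodeMjpg(const uint8_t* src, size_t src_len, uint8_t* planes[3]) {
  libyuv::MJpegDecoder decoder;
  if (!decoder.LoadFrame(src, src_len)) {
    return false;  // not a parseable MJPG frame
  }
  // DecodeToBuffers advances the pointers in planes[] past the data it
  // writes, so pass copies if the originals are still needed.
  return decoder.DecodeToBuffers(planes, decoder.GetWidth(),
                                 decoder.GetHeight()) != 0;
}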
--- a/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/third_party/libyuv/include/libyuv/planar_functions.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
 #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
 
 #include "libyuv/basic_types.h"
@@ -22,42 +22,123 @@
 extern "C" {
 #endif
 
+// TODO(fbarchard): Move cpu macros to row.h
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
+#define LIBYUV_DISABLE_X86
+#endif
+// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
+#if defined(__has_feature)
+#if __has_feature(memory_sanitizer)
+#define LIBYUV_DISABLE_X86
+#endif
+#endif
+// The following are available on all x86 platforms:
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_ARGBAFFINEROW_SSE2
+#endif
+
 // Copy a plane of data.
 LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+void CopyPlane(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
 
 LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
-                  uint16* dst_y, int dst_stride_y,
-                  int width, int height);
+void CopyPlane_16(const uint16_t* src_y,
+                  int src_stride_y,
+                  uint16_t* dst_y,
+                  int dst_stride_y,
+                  int width,
+                  int height);
 
+LIBYUV_API
+void Convert16To8Plane(const uint16_t* src_y,
+                       int src_stride_y,
+                       uint8_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height);
+
+LIBYUV_API
+void Convert8To16Plane(const uint8_t* src_y,
+                       int src_stride_y,
+                       uint16_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 1024 for 10 bits
+                       int width,
+                       int height);
+
 // Set a plane of data to a 32 bit value.
 LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
-              int width, int height,
-              uint32 value);
+void SetPlane(uint8_t* dst_y,
+              int dst_stride_y,
+              int width,
+              int height,
+              uint32_t value);
 
 // Split interleaved UV plane into separate U and V planes.
 LIBYUV_API
-void SplitUVPlane(const uint8* src_uv, int src_stride_uv,
-                  uint8* dst_u, int dst_stride_u,
-                  uint8* dst_v, int dst_stride_v,
-                  int width, int height);
+void SplitUVPlane(const uint8_t* src_uv,
+                  int src_stride_uv,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int width,
+                  int height);
 
 // Merge separate U and V planes into one interleaved UV plane.
 LIBYUV_API
-void MergeUVPlane(const uint8* src_u, int src_stride_u,
-                  const uint8* src_v, int src_stride_v,
-                  uint8* dst_uv, int dst_stride_uv,
-                  int width, int height);
+void MergeUVPlane(const uint8_t* src_u,
+                  int src_stride_u,
+                  const uint8_t* src_v,
+                  int src_stride_v,
+                  uint8_t* dst_uv,
+                  int dst_stride_uv,
+                  int width,
+                  int height);
 
+// Split interleaved RGB plane into separate R, G and B planes.
+LIBYUV_API
+void SplitRGBPlane(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_r,
+                   int dst_stride_r,
+                   uint8_t* dst_g,
+                   int dst_stride_g,
+                   uint8_t* dst_b,
+                   int dst_stride_b,
+                   int width,
+                   int height);
+
+// Merge separate R, G and B planes into one interleaved RGB plane.
+LIBYUV_API
+void MergeRGBPlane(const uint8_t* src_r,
+                   int src_stride_r,
+                   const uint8_t* src_g,
+                   int src_stride_g,
+                   const uint8_t* src_b,
+                   int src_stride_b,
+                   uint8_t* dst_rgb,
+                   int dst_stride_rgb,
+                   int width,
+                   int height);
+
 // Copy I400.  Supports inverting.
 LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+int I400ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
 
 #define J400ToJ400 I400ToI400
 
@@ -64,60 +145,105 @@
 // Copy I422 to I422.
 #define I422ToI422 I422Copy
 LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
+int I422Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
 
 // Copy I444 to I444.
 #define I444ToI444 I444Copy
 LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height);
+int I444Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height);
 
 // Convert YUY2 to I422.
 LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int YUY2ToI422(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
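
YUY2 packs two pixels in four bytes, so its stride is twice the pixel width, while the I422 chroma planes are half width at full height. A sketch with invented dimensions:

#include <stdint.h>
#include "libyuv/planar_functions.h"

enum { kCifW = 352, kCifH = 288 }; /* hypothetical CIF frame */
static uint8_t y[kCifW * kCifH];
static uint8_t u[(kCifW / 2) * kCifH];
static uint8_t v[(kCifW / 2) * kCifH];

int unpack_yuy2(const uint8_t* yuy2) {
  /* Returns 0 on success, per the int convention of these APIs. */
  return YUY2ToI422(yuy2, kCifW * 2, y, kCifW, u, kCifW / 2, v, kCifW / 2,
                    kCifW, kCifH);
}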
 
 // Convert UYVY to I422.
 LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int UYVYToI422(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
+int YUY2ToNV12(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
 
 LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height);
+int UYVYToNV12(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
 
+LIBYUV_API
+int YUY2ToY(const uint8_t* src_yuy2,
+            int src_stride_yuy2,
+            uint8_t* dst_y,
+            int dst_stride_y,
+            int width,
+            int height);
+
 // Convert I420 to I400. (calls CopyPlane ignoring u/v).
 LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+int I420ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
 
 // Alias
 #define J420ToJ400 I420ToI400
@@ -125,13 +251,20 @@
 
 // I420 mirror.
 LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height);
+int I420Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height);
 
 // Alias
 #define I400ToI400Mirror I400Mirror
@@ -139,9 +272,12 @@
 // I400 mirror.  A single plane is mirrored horizontally.
 // Pass negative height to achieve 180 degree rotation.
 LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+int I400Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
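
Since a negative height inverts the image, mirroring with the height negated yields a 180 degree rotation in a single call. A sketch, with packed strides assumed:

#include <stdint.h>
#include "libyuv/planar_functions.h"

int rotate_plane_180(const uint8_t* src, uint8_t* dst, int w, int h) {
  /* Mirror horizontally and flip vertically == rotate 180 degrees. */
  return I400Mirror(src, w, dst, w, w, -h);
}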
 
 // Alias
 #define ARGBToARGBMirror ARGBMirror
@@ -148,78 +284,127 @@
 
 // ARGB mirror.
 LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int ARGBMirror(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Convert NV12 to RGB565.
 LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height);
+int NV12ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height);
 
 // I422ToARGB is in convert_argb.h
 // Convert I422 to BGRA.
 LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height);
+int I422ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height);
 
 // Convert I422 to ABGR.
 LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height);
+int I422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height);
 
 // Convert I422 to RGBA.
 LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height);
+int I422ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height);
 
 // Alias
 #define RGB24ToRAW RAWToRGB24
 
 LIBYUV_API
-int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
-               uint8* dst_rgb24, int dst_stride_rgb24,
-               int width, int height);
+int RAWToRGB24(const uint8_t* src_raw,
+               int src_stride_raw,
+               uint8_t* dst_rgb24,
+               int dst_stride_rgb24,
+               int width,
+               int height);
 
 // Draw a rectangle into I420.
 LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int x, int y, int width, int height,
-             int value_y, int value_u, int value_v);
+int I420Rect(uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int x,
+             int y,
+             int width,
+             int height,
+             int value_y,
+             int value_u,
+             int value_v);
 
 // Draw a rectangle into ARGB.
 LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
-             int x, int y, int width, int height, uint32 value);
+int ARGBRect(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height,
+             uint32_t value);
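
One sketch covering both rectangle fills; the I420 values paint mid-gray (Y = U = V = 128), and the ARGB word 0xFFFF0000 is opaque red assuming the usual little-endian B,G,R,A byte layout. Coordinates and sizes are illustrative:

#include <stdint.h>
#include "libyuv/planar_functions.h"

void fill_rects(uint8_t* y, uint8_t* u, uint8_t* v, uint8_t* argb, int w,
                int h) {
  /* Mid-gray rectangle over the whole I420 frame. */
  I420Rect(y, w, u, w / 2, v, w / 2, 0, 0, w, h, 128, 128, 128);
  /* 64x64 opaque red square at (8, 8); assumes the frame is large enough. */
  ARGBRect(argb, w * 4, 8, 8, 64, 64, 0xFFFF0000u);
}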
 
 // Convert ARGB to gray scale ARGB.
 LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
+int ARGBGrayTo(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height);
 
 // Make a rectangle of ARGB gray scale.
 LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
-             int x, int y, int width, int height);
+int ARGBGray(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height);
 
 // Make a rectangle of ARGB Sepia tone.
 LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
-              int x, int y, int width, int height);
+int ARGBSepia(uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_x,
+              int dst_y,
+              int width,
+              int height);
 
 // Apply a matrix rotation to each ARGB pixel.
 // matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2.
@@ -228,10 +413,13 @@
 // The next 4 coefficients apply to B, G, R, A and produce R of the output.
 // The last 4 coefficients apply to B, G, R, A and produce A of the output.
 LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    const int8* matrix_argb,
-                    int width, int height);
+int ARGBColorMatrix(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    const int8_t* matrix_argb,
+                    int width,
+                    int height);
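
On the -128..127 scale described above, 64 represents 1.0, so an identity transform puts 64 on the diagonal. A sketch (buffers and size invented):

#include <stdint.h>
#include "libyuv/planar_functions.h"

int color_matrix_identity(const uint8_t* src, uint8_t* dst, int w, int h) {
  /* Each row maps (B, G, R, A) to one output channel; 64 == 1.0. */
  static const int8_t kIdentity[16] = {
      64, 0,  0,  0,  /* -> B */
      0,  64, 0,  0,  /* -> G */
      0,  0,  64, 0,  /* -> R */
      0,  0,  0,  64, /* -> A */
  };
  return ARGBColorMatrix(src, w * 4, dst, w * 4, kIdentity, w, h);
}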
 
 // Deprecated. Use ARGBColorMatrix instead.
 // Apply a matrix rotation to each ARGB pixel.
@@ -240,32 +428,47 @@
 // The next 4 coefficients apply to B, G, R, A and produce G of the output.
 // The last 4 coefficients apply to B, G, R, A and produce R of the output.
 LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
-                   const int8* matrix_rgb,
-                   int x, int y, int width, int height);
+int RGBColorMatrix(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const int8_t* matrix_rgb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height);
 
 // Apply a color table to each ARGB pixel.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                   const uint8* table_argb,
-                   int x, int y, int width, int height);
+int ARGBColorTable(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const uint8_t* table_argb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height);
 
 // Apply a color table to each ARGB pixel but preserve destination alpha.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                  const uint8* table_argb,
-                  int x, int y, int width, int height);
+int RGBColorTable(uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  const uint8_t* table_argb,
+                  int dst_x,
+                  int dst_y,
+                  int width,
+                  int height);
 
 // Apply a luma/color table to each ARGB pixel but preserve destination alpha.
 // Table contains 32768 values indexed by [Y][C] where Y is 7 bit luma from
 // RGB (YJ style) and C is an 8 bit color component (R, G or B).
 LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_argb, int dst_stride_argb,
-                       const uint8* luma_rgb_table,
-                       int width, int height);
+int ARGBLumaColorTable(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       const uint8_t* luma,
+                       int width,
+                       int height);
 
 // Apply a 3 term polynomial to ARGB values.
 // poly points to a 4x4 matrix.  The first row is constants.  The 2nd row is
@@ -276,46 +479,84 @@
 // A polynomial approximation can be derived using software such as 'R'.
 
 LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb, int dst_stride_argb,
+int ARGBPolynomial(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
                    const float* poly,
-                   int width, int height);
+                   int width,
+                   int height);
 
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16_t* src_y,
+                   int src_stride_y,
+                   uint16_t* dst_y,
+                   int dst_stride_y,
+                   float scale,
+                   int width,
+                   int height);
+
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
+
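
ByteToFloat is one-dimensional (a width but no stride or height), which suits normalizing a single row or a flat buffer. A sketch; the 1/255 scale mapping bytes to 0..1 is an assumption about the desired output range:

#include <stdint.h>
#include "libyuv/planar_functions.h"

enum { kN = 256 }; /* hypothetical buffer length */
static float out[kN];

int normalize_bytes(const uint8_t* in) {
  return ByteToFloat(in, out, 1.0f / 255.0f, kN);
}
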
 // Quantize a rectangle of ARGB. Alpha unaffected.
 // scale is a 16 bit fractional fixed point scaler between 0 and 65535.
 // interval_size should be a value between 1 and 255.
 // interval_offset should be a value between 0 and 255.
 LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
-                 int scale, int interval_size, int interval_offset,
-                 int x, int y, int width, int height);
+int ARGBQuantize(uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int scale,
+                 int interval_size,
+                 int interval_offset,
+                 int dst_x,
+                 int dst_y,
+                 int width,
+                 int height);
 
 // Copy ARGB to ARGB.
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height);
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height);
 
 // Copy Alpha channel of ARGB to alpha of ARGB.
 LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height);
+int ARGBCopyAlpha(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height);
 
 // Extract the alpha channel from ARGB.
 LIBYUV_API
-int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb,
-                     uint8* dst_a, int dst_stride_a,
-                     int width, int height);
+int ARGBExtractAlpha(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_a,
+                     int dst_stride_a,
+                     int width,
+                     int height);
 
 // Copy Y channel to Alpha of ARGB.
 LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
-                     uint8* dst_argb, int dst_stride_argb,
-                     int width, int height);
+int ARGBCopyYToAlpha(const uint8_t* src_y,
+                     int src_stride_y,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height);
 
-typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1,
-                             uint8* dst_argb, int width);
+typedef void (*ARGBBlendRow)(const uint8_t* src_argb0,
+                             const uint8_t* src_argb1,
+                             uint8_t* dst_argb,
+                             int width);
 
 // Get function to Alpha Blend ARGB pixels and store to destination.
 LIBYUV_API
@@ -325,92 +566,143 @@
 // Source is pre-multiplied by alpha using ARGBAttenuate.
 // Alpha of destination is set to 255.
 LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
-              const uint8* src_argb1, int src_stride_argb1,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
+int ARGBBlend(const uint8_t* src_argb0,
+              int src_stride_argb0,
+              const uint8_t* src_argb1,
+              int src_stride_argb1,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
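
Because ARGBBlend expects premultiplied sources, a typical call attenuates first. A sketch compositing a foreground over an already-attenuated background; buffer names and packed strides are invented:

#include <stdint.h>
#include "libyuv/planar_functions.h"

int blend_over(const uint8_t* fg, uint8_t* fg_pm, const uint8_t* bg_pm,
               uint8_t* out, int w, int h) {
  /* Premultiply the foreground, per the contract above. */
  if (ARGBAttenuate(fg, w * 4, fg_pm, w * 4, w, h)) {
    return -1;
  }
  /* Destination alpha is set to 255 by ARGBBlend. */
  return ARGBBlend(fg_pm, w * 4, bg_pm, w * 4, out, w * 4, w, h);
}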
 
 // Alpha Blend plane and store to destination.
 // Source is not pre-multiplied by alpha.
 LIBYUV_API
-int BlendPlane(const uint8* src_y0, int src_stride_y0,
-               const uint8* src_y1, int src_stride_y1,
-               const uint8* alpha, int alpha_stride,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height);
+int BlendPlane(const uint8_t* src_y0,
+               int src_stride_y0,
+               const uint8_t* src_y1,
+               int src_stride_y1,
+               const uint8_t* alpha,
+               int alpha_stride,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height);
 
 // Alpha Blend YUV images and store to destination.
 // Source is not pre-multiplied by alpha.
 // Alpha is full width x height and subsampled to half size to apply to UV.
 LIBYUV_API
-int I420Blend(const uint8* src_y0, int src_stride_y0,
-              const uint8* src_u0, int src_stride_u0,
-              const uint8* src_v0, int src_stride_v0,
-              const uint8* src_y1, int src_stride_y1,
-              const uint8* src_u1, int src_stride_u1,
-              const uint8* src_v1, int src_stride_v1,
-              const uint8* alpha, int alpha_stride,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height);
+int I420Blend(const uint8_t* src_y0,
+              int src_stride_y0,
+              const uint8_t* src_u0,
+              int src_stride_u0,
+              const uint8_t* src_v0,
+              int src_stride_v0,
+              const uint8_t* src_y1,
+              int src_stride_y1,
+              const uint8_t* src_u1,
+              int src_stride_u1,
+              const uint8_t* src_v1,
+              int src_stride_v1,
+              const uint8_t* alpha,
+              int alpha_stride,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height);
 
 // Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255.
 LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
+int ARGBMultiply(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
 
 // Add ARGB image with ARGB image. Saturates to 255.
 LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
-            const uint8* src_argb1, int src_stride_argb1,
-            uint8* dst_argb, int dst_stride_argb,
-            int width, int height);
+int ARGBAdd(const uint8_t* src_argb0,
+            int src_stride_argb0,
+            const uint8_t* src_argb1,
+            int src_stride_argb1,
+            uint8_t* dst_argb,
+            int dst_stride_argb,
+            int width,
+            int height);
 
 // Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0.
 LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height);
+int ARGBSubtract(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height);
 
 // Convert I422 to YUY2.
 LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
+int I422ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height);
 
 // Convert I422 to UYVY.
 LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_frame, int dst_stride_frame,
-               int width, int height);
+int I422ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height);
 
 // Convert unattenuated ARGB to preattenuated ARGB.
 LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height);
+int ARGBAttenuate(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height);
 
 // Convert preattenuated ARGB to unattenuated ARGB.
 LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height);
+int ARGBUnattenuate(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height);
 
 // Internal function - do not call directly.
 // Computes table of cumulative sum for image where the value is the sum
 // of all values above and to the left of the entry. Used by ARGBBlur.
 LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
-                             int32* dst_cumsum, int dst_stride32_cumsum,
-                             int width, int height);
+int ARGBComputeCumulativeSum(const uint8_t* src_argb,
+                             int src_stride_argb,
+                             int32_t* dst_cumsum,
+                             int dst_stride32_cumsum,
+                             int width,
+                             int height);
 
 // Blur ARGB image.
 // dst_cumsum table of width * (height + 1) * 16 bytes aligned to
@@ -419,16 +711,25 @@
 // radius is the number of pixels around the center, e.g. 1 = 3x3, 2 = 5x5.
 // Blur is optimized for radius of 5 (11x11) or less.
 LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int32* dst_cumsum, int dst_stride32_cumsum,
-             int width, int height, int radius);
+int ARGBBlur(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int32_t* dst_cumsum,
+             int dst_stride32_cumsum,
+             int width,
+             int height,
+             int radius);
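
A sketch of the blur call. The scratch buffer follows the width * (height + 1) * 16 byte sizing above; the cumsum stride of width * 4 int32 entries (four sums per pixel) and malloc's alignment being sufficient are assumptions:

#include <stdint.h>
#include <stdlib.h>
#include "libyuv/planar_functions.h"

int blur5x5(const uint8_t* src, uint8_t* dst, int w, int h) {
  int32_t* cumsum = (int32_t*)malloc((size_t)w * (h + 1) * 16);
  if (!cumsum) {
    return -1;
  }
  /* radius 2 gives a 5x5 window, within the optimized range noted above. */
  int ret = ARGBBlur(src, w * 4, dst, w * 4, cumsum, w * 4, w, h, 2);
  free(cumsum);
  return ret;
}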
 
 // Multiply ARGB image by ARGB value.
 LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height, uint32 value);
+int ARGBShade(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height,
+              uint32_t value);
 
 // Interpolate between two images using a specified amount of interpolation
 // (0 to 255) and store to destination.
@@ -435,83 +736,108 @@
 // 'interpolation' is specified as 8 bit fraction where 0 means 100% src0
 // and 255 means 1% src0 and 99% src1.
 LIBYUV_API
-int InterpolatePlane(const uint8* src0, int src_stride0,
-                     const uint8* src1, int src_stride1,
-                     uint8* dst, int dst_stride,
-                     int width, int height, int interpolation);
+int InterpolatePlane(const uint8_t* src0,
+                     int src_stride0,
+                     const uint8_t* src1,
+                     int src_stride1,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width,
+                     int height,
+                     int interpolation);
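
At interpolation 128 the two sources are weighted equally, giving a 50/50 crossfade. A sketch on a single plane, with packed strides assumed:

#include <stdint.h>
#include "libyuv/planar_functions.h"

int crossfade_half(const uint8_t* a, const uint8_t* b, uint8_t* dst, int w,
                   int h) {
  return InterpolatePlane(a, w, b, w, dst, w, w, h, 128);
}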
 
 // Interpolate between two ARGB images using a specified amount of interpolation
 // Internally calls InterpolatePlane with width * 4 (bpp).
 LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
-                    const uint8* src_argb1, int src_stride_argb1,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int interpolation);
+int ARGBInterpolate(const uint8_t* src_argb0,
+                    int src_stride_argb0,
+                    const uint8_t* src_argb1,
+                    int src_stride_argb1,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int interpolation);
 
 // Interpolate between two YUV images using a specified amount of interpolation
 // Internally calls InterpolatePlane on each plane where the U and V planes
 // are half width and half height.
 LIBYUV_API
-int I420Interpolate(const uint8* src0_y, int src0_stride_y,
-                    const uint8* src0_u, int src0_stride_u,
-                    const uint8* src0_v, int src0_stride_v,
-                    const uint8* src1_y, int src1_stride_y,
-                    const uint8* src1_u, int src1_stride_u,
-                    const uint8* src1_v, int src1_stride_v,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height, int interpolation);
+int I420Interpolate(const uint8_t* src0_y,
+                    int src0_stride_y,
+                    const uint8_t* src0_u,
+                    int src0_stride_u,
+                    const uint8_t* src0_v,
+                    int src0_stride_v,
+                    const uint8_t* src1_y,
+                    int src1_stride_y,
+                    const uint8_t* src1_u,
+                    int src1_stride_u,
+                    const uint8_t* src1_v,
+                    int src1_stride_v,
+                    uint8_t* dst_y,
+                    int dst_stride_y,
+                    uint8_t* dst_u,
+                    int dst_stride_u,
+                    uint8_t* dst_v,
+                    int dst_stride_v,
+                    int width,
+                    int height,
+                    int interpolation);
 
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
-#define LIBYUV_DISABLE_X86
-#endif
-// MemorySanitizer does not support assembly code yet. http://crbug.com/344505
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
-#define LIBYUV_DISABLE_X86
-#endif
-#endif
-// The following are available on all x86 platforms:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
-#define HAS_ARGBAFFINEROW_SSE2
-#endif
-
 // Row function for copying pixels from a source with a slope to a row
 // of destination. Useful for scaling, rotation, mirror, texture mapping.
 LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_C(const uint8_t* src_argb,
+                     int src_argb_stride,
+                     uint8_t* dst_argb,
+                     const float* uv_dudv,
+                     int width);
+// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h
 LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                        int src_argb_stride,
+                        uint8_t* dst_argb,
+                        const float* uv_dudv,
+                        int width);
 
 // Shuffle ARGB channel order.  e.g. BGRA to ARGB.
 // shuffler is 16 bytes and must be aligned.
 LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
-                uint8* dst_argb, int dst_stride_argb,
-                const uint8* shuffler, int width, int height);
+int ARGBShuffle(const uint8_t* src_bgra,
+                int src_stride_bgra,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                const uint8_t* shuffler,
+                int width,
+                int height);
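
A shuffler that reverses each 4-byte group swaps BGRA and ARGB byte order. The 16-byte table must be aligned; the sketch below uses a static array and assumes the toolchain aligns it adequately:

#include <stdint.h>
#include "libyuv/planar_functions.h"

int swap_byte_order(const uint8_t* src, uint8_t* dst, int w, int h) {
  /* Reverse bytes within each pixel: indices 3,2,1,0 per 4-byte group. */
  static const uint8_t kReverse[16] = {3,  2,  1, 0, 7,  6,  5,  4,
                                       11, 10, 9, 8, 15, 14, 13, 12};
  return ARGBShuffle(src, w * 4, dst, w * 4, kReverse, w, h);
}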
 
 // Sobel ARGB effect with planar output.
 LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
-                     uint8* dst_y, int dst_stride_y,
-                     int width, int height);
+int ARGBSobelToPlane(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     int width,
+                     int height);
 
 // Sobel ARGB effect.
 LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height);
+int ARGBSobel(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height);
 
 // Sobel ARGB effect with Sobel X, Sobel, and Sobel Y stored in the ARGB
 // channels.
 LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height);
+int ARGBSobelXY(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height);
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -518,4 +844,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
--- a/third_party/libyuv/include/libyuv/rotate.h
+++ b/third_party/libyuv/include/libyuv/rotate.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROTATE_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_H_
 #define INCLUDE_LIBYUV_ROTATE_H_
 
 #include "libyuv/basic_types.h"
@@ -20,8 +20,8 @@
 
 // Supported rotation.
 typedef enum RotationMode {
-  kRotate0 = 0,  // No rotation.
-  kRotate90 = 90,  // Rotate 90 degrees clockwise.
+  kRotate0 = 0,      // No rotation.
+  kRotate90 = 90,    // Rotate 90 degrees clockwise.
   kRotate180 = 180,  // Rotate 180 degrees.
   kRotate270 = 270,  // Rotate 270 degrees clockwise.
 
@@ -33,50 +33,82 @@
 
 // Rotate I420 frame.
 LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int src_width, int src_height, enum RotationMode mode);
+int I420Rotate(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
+               enum RotationMode mode);
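
For kRotate90 and kRotate270 the destination geometry is transposed, so the destination strides follow the rotated width. A sketch with packed strides and even dimensions assumed:

#include <stdint.h>
#include "libyuv/rotate.h"

int rotate_i420_90(const uint8_t* sy, const uint8_t* su, const uint8_t* sv,
                   uint8_t* dy, uint8_t* du, uint8_t* dv, int w, int h) {
  /* After rotation the image is h pixels wide, hence dst strides of h. */
  return I420Rotate(sy, w, su, w / 2, sv, w / 2, dy, h, du, h / 2, dv, h / 2,
                    w, h, kRotate90);
}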
 
 // Rotate NV12 input and store in I420.
 LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
-                     const uint8* src_uv, int src_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int src_width, int src_height, enum RotationMode mode);
+int NV12ToI420Rotate(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height,
+                     enum RotationMode mode);
 
 // Rotate a plane by 0, 90, 180, or 270.
 LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
-                uint8* dst, int dst_stride,
-                int src_width, int src_height, enum RotationMode mode);
+int RotatePlane(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst,
+                int dst_stride,
+                int width,
+                int height,
+                enum RotationMode mode);
 
 // Rotate planes by 90, 180, 270. Deprecated.
 LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height);
+void RotatePlane90(const uint8_t* src,
+                   int src_stride,
+                   uint8_t* dst,
+                   int dst_stride,
+                   int width,
+                   int height);
 
 LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
+void RotatePlane180(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
 
 LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
+void RotatePlane270(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
 
 LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
-                uint8* dst_a, int dst_stride_a,
-                uint8* dst_b, int dst_stride_b,
-                int width, int height);
+void RotateUV90(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst_a,
+                int dst_stride_a,
+                uint8_t* dst_b,
+                int dst_stride_b,
+                int width,
+                int height);
 
 // Rotations for when U and V are interleaved.
 // These functions take one input pointer and
@@ -83,16 +115,24 @@
 // split the data into two buffers while
 // rotating them. Deprecated.
 LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
+void RotateUV180(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height);
 
 LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
+void RotateUV270(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height);
 
 // The 90 and 270 functions are based on transposes.
 // Doing a transpose while reversing the read/write
@@ -99,15 +139,22 @@
 // order will result in a rotation by +- 90 degrees.
 // Deprecated.
 LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height);
+void TransposePlane(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
 
 LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height);
+void TransposeUV(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height);
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -114,4 +161,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROTATE_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROTATE_H_
--- a/third_party/libyuv/include/libyuv/rotate_argb.h
+++ b/third_party/libyuv/include/libyuv/rotate_argb.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_
 #define INCLUDE_LIBYUV_ROTATE_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -21,9 +21,13 @@
 
 // Rotate ARGB frame
 LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int src_width, int src_height, enum RotationMode mode);
+int ARGBRotate(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               enum RotationMode mode);
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -30,4 +34,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROTATE_ARGB_H_
--- a/third_party/libyuv/include/libyuv/rotate_row.h
+++ b/third_party/libyuv/include/libyuv/rotate_row.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_
 #define INCLUDE_LIBYUV_ROTATE_ROW_H_
 
 #include "libyuv/basic_types.h"
@@ -18,10 +18,14 @@
 extern "C" {
 #endif
 
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
@@ -29,89 +33,158 @@
 #endif
 #endif
 // The following are available for Visual C and clangcl 32 bit:
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 #define HAS_TRANSPOSEWX8_SSSE3
 #define HAS_TRANSPOSEUVWX8_SSE2
 #endif
 
-// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit:
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+// The following are available for GCC 32 or 64 bit:
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
 #define HAS_TRANSPOSEWX8_SSSE3
 #endif
 
-// The following are available for 64 bit GCC but not NaCL:
-#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
-    defined(__x86_64__)
+// The following are available for 64 bit GCC:
+#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__)
 #define HAS_TRANSPOSEWX8_FAST_SSSE3
 #define HAS_TRANSPOSEUVWX8_SSE2
 #endif
 
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+#if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_TRANSPOSEWX8_NEON
 #define HAS_TRANSPOSEUVWX8_NEON
 #endif
 
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
-    defined(__mips__) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_TRANSPOSEWX8_DSPR2
-#define HAS_TRANSPOSEUVWX8_DSPR2
-#endif  // defined(__mips__)
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_TRANSPOSEWX16_MSA
+#define HAS_TRANSPOSEUVWX16_MSA
+#endif
 
-void TransposeWxH_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width, int height);
+void TransposeWxH_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height);
 
-void TransposeWx8_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width);
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width);
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width);
-void TransposeWx8_DSPR2(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width);
+void TransposeWx8_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width);
+void TransposeWx16_C(const uint8_t* src,
+                     int src_stride,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width);
+void TransposeWx8_NEON(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
+void TransposeWx8_SSSE3(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst,
+                        int dst_stride,
+                        int width);
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst,
+                             int dst_stride,
+                             int width);
+void TransposeWx16_MSA(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
 
-void TransposeWx8_Any_NEON(const uint8* src, int src_stride,
-                           uint8* dst, int dst_stride, int width);
-void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride,
-                            uint8* dst, int dst_stride, int width);
-void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride,
-                                 uint8* dst, int dst_stride, int width);
-void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride,
-                            uint8* dst, int dst_stride, int width);
+void TransposeWx8_Any_NEON(const uint8_t* src,
+                           int src_stride,
+                           uint8_t* dst,
+                           int dst_stride,
+                           int width);
+void TransposeWx8_Any_SSSE3(const uint8_t* src,
+                            int src_stride,
+                            uint8_t* dst,
+                            int dst_stride,
+                            int width);
+void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src,
+                                 int src_stride,
+                                 uint8_t* dst,
+                                 int dst_stride,
+                                 int width);
+void TransposeWx16_Any_MSA(const uint8_t* src,
+                           int src_stride,
+                           uint8_t* dst,
+                           int dst_stride,
+                           int width);
 
-void TransposeUVWxH_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b,
-                      int width, int height);
+void TransposeUVWxH_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height);
 
-void TransposeUVWx8_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
-                          uint8* dst_a, int dst_stride_a,
-                          uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width);
+void TransposeUVWx16_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst_a,
+                       int dst_stride_a,
+                       uint8_t* dst_b,
+                       int dst_stride_b,
+                       int width);
+void TransposeUVWx8_SSE2(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
+void TransposeUVWx8_NEON(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
+void TransposeUVWx16_MSA(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width);
 
-void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride,
-                             uint8* dst_a, int dst_stride_a,
-                             uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride,
-                             uint8* dst_a, int dst_stride_a,
-                             uint8* dst_b, int dst_stride_b, int width);
-void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride,
-                              uint8* dst_a, int dst_stride_a,
-                              uint8* dst_b, int dst_stride_b, int width);
+void TransposeUVWx8_Any_SSE2(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
+void TransposeUVWx8_Any_NEON(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
+void TransposeUVWx16_Any_MSA(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -118,4 +191,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROTATE_ROW_H_
--- a/third_party/libyuv/include/libyuv/row.h
+++ b/third_party/libyuv/include/libyuv/row.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_ROW_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_ROW_H_
 #define INCLUDE_LIBYUV_ROW_H_
 
 #include <stdlib.h>  // For malloc.
@@ -20,27 +20,14 @@
 extern "C" {
 #endif
 
-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
-
-#ifdef __cplusplus
-#define align_buffer_64(var, size)                                             \
-  uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63));            \
-  uint8* var = reinterpret_cast<uint8*>                                        \
-      ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
-#else
-#define align_buffer_64(var, size)                                             \
-  uint8* var##_mem = (uint8*)(malloc((size) + 63));               /* NOLINT */ \
-  uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63)       /* NOLINT */
-#endif
-
-#define free_aligned_buffer_64(var) \
-  free(var##_mem);  \
-  var = 0
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
@@ -47,14 +34,6 @@
 #define LIBYUV_DISABLE_X86
 #endif
 #endif
-// True if compiling for SSSE3 as a requirement.
-#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
-#define LIBYUV_SSSE3_ONLY
-#endif
-
-#if defined(__native_client__)
-#define LIBYUV_DISABLE_NEON
-#endif
 // clang >= 3.5.0 required for Arm64.
 #if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
 #if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
@@ -76,9 +55,17 @@
 #endif  // clang >= 3.4
 #endif  // __clang__
 
+// clang >= 6.0.0 required for AVX512.
+// TODO(fbarchard): fix xcode 9 ios b/789.
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ >= 7) && !defined(__APPLE_EMBEDDED_SIMULATOR__)
+#define CLANG_HAS_AVX512 1
+#endif  // clang >= 7
+#endif  // __clang__
+
 // Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
-    defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+    _MSC_VER >= 1700
 #define VISUALC_HAS_AVX2 1
 #endif  // VisualStudio >= 2012
 
@@ -90,8 +77,8 @@
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
+#define HAS_ARGBEXTRACTALPHAROW_SSE2
 #define HAS_ARGBSETROW_X86
-#define HAS_ARGBSHUFFLEROW_SSE2
 #define HAS_ARGBSHUFFLEROW_SSSE3
 #define HAS_ARGBTOARGB1555ROW_SSE2
 #define HAS_ARGBTOARGB4444ROW_SSE2
@@ -104,12 +91,12 @@
 #define HAS_ARGBTOUVROW_SSSE3
 #define HAS_ARGBTOYJROW_SSSE3
 #define HAS_ARGBTOYROW_SSSE3
-#define HAS_ARGBEXTRACTALPHAROW_SSE2
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
 #define HAS_H422TOARGBROW_SSSE3
+#define HAS_HALFFLOATROW_SSE2
 #define HAS_I400TOARGBROW_SSE2
 #define HAS_I422TOARGB1555ROW_SSSE3
 #define HAS_I422TOARGB4444ROW_SSSE3
@@ -126,8 +113,10 @@
 #define HAS_MIRRORROW_SSSE3
 #define HAS_MIRRORUVROW_SSSE3
 #define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV12TORGB24ROW_SSSE3
 #define HAS_NV12TORGB565ROW_SSSE3
 #define HAS_NV21TOARGBROW_SSSE3
+#define HAS_NV21TORGB24ROW_SSSE3
 #define HAS_RAWTOARGBROW_SSSE3
 #define HAS_RAWTORGB24ROW_SSSE3
 #define HAS_RAWTOYROW_SSSE3
@@ -180,22 +169,22 @@
 
 // The following functions fail on gcc/clang 32 bit with fpic and framepointer.
 // caveat: clangcl uses row_win.cc which works.
-#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \
-    !defined(__i386__) || defined(_MSC_VER)
-// TODO(fbarchard): fix build error on x86 debug
-// https://code.google.com/p/libyuv/issues/detail?id=524
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+    defined(_MSC_VER)
 // TODO(fbarchard): fix build error on android_full_debug=1
 // https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_SSSE3
 #endif
 #endif
 
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
-// The code supports NaCL but requires a new compiler and validator.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
-    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#if !defined(LIBYUV_DISABLE_X86) &&                          \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+     defined(GCC_HAS_AVX2))
 #define HAS_ARGBCOPYALPHAROW_AVX2
 #define HAS_ARGBCOPYYTOALPHAROW_AVX2
+#define HAS_ARGBEXTRACTALPHAROW_AVX2
 #define HAS_ARGBMIRRORROW_AVX2
 #define HAS_ARGBPOLYNOMIALROW_AVX2
 #define HAS_ARGBSHUFFLEROW_AVX2
@@ -206,12 +195,9 @@
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_COPYROW_AVX
 #define HAS_H422TOARGBROW_AVX2
+#define HAS_HALFFLOATROW_AVX2
+//  #define HAS_HALFFLOATROW_F16C  // Enable to test halffloat cast
 #define HAS_I400TOARGBROW_AVX2
-#if !(defined(_DEBUG) && defined(__i386__))
-// TODO(fbarchard): fix build error on android_full_debug=1
-// https://code.google.com/p/libyuv/issues/detail?id=517
-#endif
-#define HAS_I411TOARGBROW_AVX2
 #define HAS_I422TOARGB1555ROW_AVX2
 #define HAS_I422TOARGB4444ROW_AVX2
 #define HAS_I422TOARGBROW_AVX2
@@ -224,8 +210,10 @@
 #define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_NV12TOARGBROW_AVX2
+#define HAS_NV12TORGB24ROW_AVX2
 #define HAS_NV12TORGB565ROW_AVX2
 #define HAS_NV21TOARGBROW_AVX2
+#define HAS_NV21TORGB24ROW_AVX2
 #define HAS_SPLITUVROW_AVX2
 #define HAS_UYVYTOARGBROW_AVX2
 #define HAS_UYVYTOUV422ROW_AVX2
@@ -243,11 +231,18 @@
 #define HAS_ARGBSUBTRACTROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #define HAS_BLENDPLANEROW_AVX2
+
+#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \
+    defined(_MSC_VER)
+// TODO(fbarchard): fix build error on android_full_debug=1
+// https://code.google.com/p/libyuv/issues/detail?id=517
+#define HAS_I422ALPHATOARGBROW_AVX2
 #endif
+#endif
 
 // The following are available for AVX2 Visual C and clangcl 32 bit:
 // TODO(fbarchard): Port to gcc.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \
     (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2))
 #define HAS_ARGB1555TOARGBROW_AVX2
 #define HAS_ARGB4444TOARGBROW_AVX2
@@ -261,9 +256,55 @@
 // The following are also available on x64 Visual C.
 #if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && defined(_M_X64) && \
     (!defined(__clang__) || defined(__SSSE3__))
+#define HAS_I422ALPHATOARGBROW_SSSE3
 #define HAS_I422TOARGBROW_SSSE3
 #endif
 
+// The following are available for gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) && \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
+#define HAS_ABGRTOAR30ROW_SSSE3
+#define HAS_ARGBTOAR30ROW_SSSE3
+#define HAS_CONVERT16TO8ROW_SSSE3
+#define HAS_CONVERT8TO16ROW_SSE2
+// I210 is for H010.  2 = 4:2:2 subsampling.  I = BT.601 vs H = BT.709.
+#define HAS_I210TOAR30ROW_SSSE3
+#define HAS_I210TOARGBROW_SSSE3
+#define HAS_I422TOAR30ROW_SSSE3
+#define HAS_MERGERGBROW_SSSE3
+#define HAS_SPLITRGBROW_SSSE3
+#endif
+
+// The following are available for AVX2 gcc/clang x86 platforms:
+// TODO(fbarchard): Port to Visual C
+#if !defined(LIBYUV_DISABLE_X86) &&                                       \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#define HAS_ABGRTOAR30ROW_AVX2
+#define HAS_ARGBTOAR30ROW_AVX2
+#define HAS_ARGBTORAWROW_AVX2
+#define HAS_ARGBTORGB24ROW_AVX2
+#define HAS_CONVERT16TO8ROW_AVX2
+#define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_I210TOAR30ROW_AVX2
+#define HAS_I210TOARGBROW_AVX2
+#define HAS_I422TOAR30ROW_AVX2
+#define HAS_I422TOUYVYROW_AVX2
+#define HAS_I422TOYUY2ROW_AVX2
+#define HAS_MERGEUVROW_16_AVX2
+#define HAS_MULTIPLYROW_16_AVX2
+#endif
+
+// The following are available for AVX512 clang x86 platforms:
+// TODO(fbarchard): Port to GCC and Visual C
+// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+#if !defined(LIBYUV_DISABLE_X86) &&                                       \
+    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \
+    (defined(CLANG_HAS_AVX512))
+#define HAS_ARGBTORGB24ROW_AVX512VBMI
+#endif
+
 // The following are available on Neon platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -275,6 +316,7 @@
 #define HAS_ARGB4444TOARGBROW_NEON
 #define HAS_ARGB4444TOUVROW_NEON
 #define HAS_ARGB4444TOYROW_NEON
+#define HAS_ARGBEXTRACTALPHAROW_NEON
 #define HAS_ARGBSETROW_NEON
 #define HAS_ARGBTOARGB1555ROW_NEON
 #define HAS_ARGBTOARGB4444ROW_NEON
@@ -282,18 +324,17 @@
 #define HAS_ARGBTORGB24ROW_NEON
 #define HAS_ARGBTORGB565DITHERROW_NEON
 #define HAS_ARGBTORGB565ROW_NEON
-#define HAS_ARGBTOUV411ROW_NEON
 #define HAS_ARGBTOUV444ROW_NEON
 #define HAS_ARGBTOUVJROW_NEON
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
 #define HAS_ARGBTOYROW_NEON
-#define HAS_ARGBEXTRACTALPHAROW_NEON
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
+#define HAS_BYTETOFLOATROW_NEON
 #define HAS_COPYROW_NEON
+#define HAS_HALFFLOATROW_NEON
 #define HAS_I400TOARGBROW_NEON
-#define HAS_I411TOARGBROW_NEON
 #define HAS_I422ALPHATOARGBROW_NEON
 #define HAS_I422TOARGB1555ROW_NEON
 #define HAS_I422TOARGB4444ROW_NEON
@@ -309,8 +350,10 @@
 #define HAS_MIRRORROW_NEON
 #define HAS_MIRRORUVROW_NEON
 #define HAS_NV12TOARGBROW_NEON
+#define HAS_NV12TORGB24ROW_NEON
 #define HAS_NV12TORGB565ROW_NEON
 #define HAS_NV21TOARGBROW_NEON
+#define HAS_NV21TORGB24ROW_NEON
 #define HAS_RAWTOARGBROW_NEON
 #define HAS_RAWTORGB24ROW_NEON
 #define HAS_RAWTOUVROW_NEON
@@ -324,6 +367,7 @@
 #define HAS_RGBATOUVROW_NEON
 #define HAS_RGBATOYROW_NEON
 #define HAS_SETROW_NEON
+#define HAS_SPLITRGBROW_NEON
 #define HAS_SPLITUVROW_NEON
 #define HAS_UYVYTOARGBROW_NEON
 #define HAS_UYVYTOUV422ROW_NEON
@@ -355,17 +399,87 @@
 #define HAS_SOBELYROW_NEON
 #endif
 
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-#define HAS_COPYROW_MIPS
-#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_I422TOARGBROW_DSPR2
-#define HAS_INTERPOLATEROW_DSPR2
-#define HAS_MIRRORROW_DSPR2
-#define HAS_MIRRORUVROW_DSPR2
-#define HAS_SPLITUVROW_DSPR2
+// The following are available on AArch64 platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_SCALESUMSAMPLES_NEON
 #endif
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_ABGRTOUVROW_MSA
+#define HAS_ABGRTOYROW_MSA
+#define HAS_ARGB1555TOARGBROW_MSA
+#define HAS_ARGB1555TOUVROW_MSA
+#define HAS_ARGB1555TOYROW_MSA
+#define HAS_ARGB4444TOARGBROW_MSA
+#define HAS_ARGBADDROW_MSA
+#define HAS_ARGBATTENUATEROW_MSA
+#define HAS_ARGBBLENDROW_MSA
+#define HAS_ARGBCOLORMATRIXROW_MSA
+#define HAS_ARGBEXTRACTALPHAROW_MSA
+#define HAS_ARGBGRAYROW_MSA
+#define HAS_ARGBMIRRORROW_MSA
+#define HAS_ARGBMULTIPLYROW_MSA
+#define HAS_ARGBQUANTIZEROW_MSA
+#define HAS_ARGBSEPIAROW_MSA
+#define HAS_ARGBSETROW_MSA
+#define HAS_ARGBSHADEROW_MSA
+#define HAS_ARGBSHUFFLEROW_MSA
+#define HAS_ARGBSUBTRACTROW_MSA
+#define HAS_ARGBTOARGB1555ROW_MSA
+#define HAS_ARGBTOARGB4444ROW_MSA
+#define HAS_ARGBTORAWROW_MSA
+#define HAS_ARGBTORGB24ROW_MSA
+#define HAS_ARGBTORGB565DITHERROW_MSA
+#define HAS_ARGBTORGB565ROW_MSA
+#define HAS_ARGBTOUV444ROW_MSA
+#define HAS_ARGBTOUVJROW_MSA
+#define HAS_ARGBTOUVROW_MSA
+#define HAS_ARGBTOYJROW_MSA
+#define HAS_ARGBTOYROW_MSA
+#define HAS_BGRATOUVROW_MSA
+#define HAS_BGRATOYROW_MSA
+#define HAS_HALFFLOATROW_MSA
+#define HAS_I400TOARGBROW_MSA
+#define HAS_I422ALPHATOARGBROW_MSA
+#define HAS_I422TOARGBROW_MSA
+#define HAS_I422TORGB24ROW_MSA
+#define HAS_I422TORGBAROW_MSA
+#define HAS_I422TOUYVYROW_MSA
+#define HAS_I422TOYUY2ROW_MSA
+#define HAS_I444TOARGBROW_MSA
+#define HAS_INTERPOLATEROW_MSA
+#define HAS_J400TOARGBROW_MSA
+#define HAS_MERGEUVROW_MSA
+#define HAS_MIRRORROW_MSA
+#define HAS_MIRRORUVROW_MSA
+#define HAS_NV12TOARGBROW_MSA
+#define HAS_NV12TORGB565ROW_MSA
+#define HAS_NV21TOARGBROW_MSA
+#define HAS_RAWTOARGBROW_MSA
+#define HAS_RAWTORGB24ROW_MSA
+#define HAS_RAWTOUVROW_MSA
+#define HAS_RAWTOYROW_MSA
+#define HAS_RGB24TOARGBROW_MSA
+#define HAS_RGB24TOUVROW_MSA
+#define HAS_RGB24TOYROW_MSA
+#define HAS_RGB565TOARGBROW_MSA
+#define HAS_RGB565TOUVROW_MSA
+#define HAS_RGB565TOYROW_MSA
+#define HAS_RGBATOUVROW_MSA
+#define HAS_RGBATOYROW_MSA
+#define HAS_SETROW_MSA
+#define HAS_SOBELROW_MSA
+#define HAS_SOBELTOPLANEROW_MSA
+#define HAS_SOBELXROW_MSA
+#define HAS_SOBELXYROW_MSA
+#define HAS_SOBELYROW_MSA
+#define HAS_SPLITUVROW_MSA
+#define HAS_UYVYTOARGBROW_MSA
+#define HAS_UYVYTOUVROW_MSA
+#define HAS_UYVYTOYROW_MSA
+#define HAS_YUY2TOARGBROW_MSA
+#define HAS_YUY2TOUV422ROW_MSA
+#define HAS_YUY2TOUVROW_MSA
+#define HAS_YUY2TOYROW_MSA
 #endif
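
These HAS_* guards only say a kernel was compiled in; the caller still picks
a row function at run time.  For illustration, the usual dispatch shape,
assuming libyuv's TestCpuFlag()/kCpuHasSSSE3 from include/libyuv/cpu_id.h:

  void (*ARGBToYRow)(const uint8_t* src, uint8_t* dst, int width) =
      ARGBToYRow_C;
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ARGBToYRow = ARGBToYRow_Any_SSSE3;  /* handles any width */
    if (IS_ALIGNED(width, 16)) {
      ARGBToYRow = ARGBToYRow_SSSE3;    /* multiple-of-16 fast path */
    }
  }
#endif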
 
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -374,18 +488,18 @@
 #else
 #define SIMD_ALIGNED(var) __declspec(align(16)) var
 #endif
-typedef __declspec(align(16)) int16 vec16[8];
-typedef __declspec(align(16)) int32 vec32[4];
-typedef __declspec(align(16)) int8 vec8[16];
-typedef __declspec(align(16)) uint16 uvec16[8];
-typedef __declspec(align(16)) uint32 uvec32[4];
-typedef __declspec(align(16)) uint8 uvec8[16];
-typedef __declspec(align(32)) int16 lvec16[16];
-typedef __declspec(align(32)) int32 lvec32[8];
-typedef __declspec(align(32)) int8 lvec8[32];
-typedef __declspec(align(32)) uint16 ulvec16[16];
-typedef __declspec(align(32)) uint32 ulvec32[8];
-typedef __declspec(align(32)) uint8 ulvec8[32];
+typedef __declspec(align(16)) int16_t vec16[8];
+typedef __declspec(align(16)) int32_t vec32[4];
+typedef __declspec(align(16)) int8_t vec8[16];
+typedef __declspec(align(16)) uint16_t uvec16[8];
+typedef __declspec(align(16)) uint32_t uvec32[4];
+typedef __declspec(align(16)) uint8_t uvec8[16];
+typedef __declspec(align(32)) int16_t lvec16[16];
+typedef __declspec(align(32)) int32_t lvec32[8];
+typedef __declspec(align(32)) int8_t lvec8[32];
+typedef __declspec(align(32)) uint16_t ulvec16[16];
+typedef __declspec(align(32)) uint32_t ulvec32[8];
+typedef __declspec(align(32)) uint8_t ulvec8[32];
 #elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__))
 // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
 #if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)
@@ -393,32 +507,32 @@
 #else
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
 #endif
-typedef int16 __attribute__((vector_size(16))) vec16;
-typedef int32 __attribute__((vector_size(16))) vec32;
-typedef int8 __attribute__((vector_size(16))) vec8;
-typedef uint16 __attribute__((vector_size(16))) uvec16;
-typedef uint32 __attribute__((vector_size(16))) uvec32;
-typedef uint8 __attribute__((vector_size(16))) uvec8;
-typedef int16 __attribute__((vector_size(32))) lvec16;
-typedef int32 __attribute__((vector_size(32))) lvec32;
-typedef int8 __attribute__((vector_size(32))) lvec8;
-typedef uint16 __attribute__((vector_size(32))) ulvec16;
-typedef uint32 __attribute__((vector_size(32))) ulvec32;
-typedef uint8 __attribute__((vector_size(32))) ulvec8;
+typedef int16_t __attribute__((vector_size(16))) vec16;
+typedef int32_t __attribute__((vector_size(16))) vec32;
+typedef int8_t __attribute__((vector_size(16))) vec8;
+typedef uint16_t __attribute__((vector_size(16))) uvec16;
+typedef uint32_t __attribute__((vector_size(16))) uvec32;
+typedef uint8_t __attribute__((vector_size(16))) uvec8;
+typedef int16_t __attribute__((vector_size(32))) lvec16;
+typedef int32_t __attribute__((vector_size(32))) lvec32;
+typedef int8_t __attribute__((vector_size(32))) lvec8;
+typedef uint16_t __attribute__((vector_size(32))) ulvec16;
+typedef uint32_t __attribute__((vector_size(32))) ulvec32;
+typedef uint8_t __attribute__((vector_size(32))) ulvec8;
 #else
 #define SIMD_ALIGNED(var) var
-typedef int16 vec16[8];
-typedef int32 vec32[4];
-typedef int8 vec8[16];
-typedef uint16 uvec16[8];
-typedef uint32 uvec32[4];
-typedef uint8 uvec8[16];
-typedef int16 lvec16[16];
-typedef int32 lvec32[8];
-typedef int8 lvec8[32];
-typedef uint16 ulvec16[16];
-typedef uint32 ulvec32[8];
-typedef uint8 ulvec8[32];
+typedef int16_t vec16[8];
+typedef int32_t vec32[4];
+typedef int8_t vec8[16];
+typedef uint16_t uvec16[8];
+typedef uint32_t uvec32[4];
+typedef uint8_t uvec8[16];
+typedef int16_t lvec16[16];
+typedef int32_t lvec32[8];
+typedef int8_t lvec8[32];
+typedef uint16_t ulvec16[16];
+typedef uint32_t ulvec32[8];
+typedef uint8_t ulvec8[32];
 #endif
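
For illustration, SIMD_ALIGNED wraps the variable name so the alignment
qualifier lands where each compiler expects it.  kShuffleExample is a
hypothetical constant, not one defined by libyuv:

  static const uvec8 SIMD_ALIGNED(kShuffleExample) = {
      0u, 4u, 8u, 12u, 0u, 4u, 8u, 12u, 0u, 4u, 8u, 12u, 0u, 4u, 8u, 12u};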
 
 #if defined(__aarch64__)
@@ -442,23 +556,23 @@
 #else
 // This struct is for Intel color conversion.
 struct YuvConstants {
-  int8 kUVToB[32];
-  int8 kUVToG[32];
-  int8 kUVToR[32];
-  int16 kUVBiasB[16];
-  int16 kUVBiasG[16];
-  int16 kUVBiasR[16];
-  int16 kYToRgb[16];
+  int8_t kUVToB[32];
+  int8_t kUVToG[32];
+  int8_t kUVToR[32];
+  int16_t kUVBiasB[16];
+  int16_t kUVBiasG[16];
+  int16_t kUVBiasR[16];
+  int16_t kYToRgb[16];
 };
 
 // Offsets into YuvConstants structure
-#define KUVTOB   0
-#define KUVTOG   32
-#define KUVTOR   64
+#define KUVTOB 0
+#define KUVTOG 32
+#define KUVTOR 64
 #define KUVBIASB 96
 #define KUVBIASG 128
 #define KUVBIASR 160
-#define KYTORGB  192
+#define KYTORGB 192
 #endif
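
The offsets follow from the struct layout: three int8_t[32] tables at
0/32/64, then four int16_t[16] tables (32 bytes each) at 96/128/160/192.
A compile-time sanity check could look like this sketch, assuming C11 and
<stddef.h>:

  _Static_assert(offsetof(struct YuvConstants, kUVBiasB) == KUVBIASB,
                 "KUVBIASB must match the struct layout");
  _Static_assert(offsetof(struct YuvConstants, kYToRgb) == KYTORGB,
                 "KYTORGB must match the struct layout");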
 
 // Conversion matrix for YUV to RGB
@@ -471,6 +585,16 @@
 extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants);  // JPeg
 extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants);  // BT.709
 
+#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1)))
+
+#define align_buffer_64(var, size)                                           \
+  uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63));         /* NOLINT */ \
+  uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
+
+#define free_aligned_buffer_64(var) \
+  free(var##_mem);                  \
+  var = 0
+
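For illustration, the pair is used inside one function body; rounding the
malloc'ed pointer up with (+ 63) & ~63 yields 64-byte alignment (width is a
hypothetical local):

  align_buffer_64(row, width * 4);  /* row: uint8_t*, 64-byte aligned   */
  memset(row, 0, width * 4);        /* all size bytes are usable        */
  free_aligned_buffer_64(row);      /* frees row_mem, then sets row = 0 */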
 #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__)
 #define OMITFP
 #else
@@ -483,1458 +607,2863 @@
 #else
 #define LABELALIGN
 #endif
-#if defined(__native_client__) && defined(__x86_64__)
-// r14 is used for MEMOP macros.
-#define NACL_R14 "r14",
-#define BUNDLELOCK ".bundle_lock\n"
-#define BUNDLEUNLOCK ".bundle_unlock\n"
-#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")"
-#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")"
-#define MEMLEA(offset, base) #offset "(%q" #base ")"
-#define MEMLEA3(offset, index, scale) \
-    #offset "(,%q" #index "," #scale ")"
-#define MEMLEA4(offset, base, index, scale) \
-    #offset "(%q" #base ",%q" #index "," #scale ")"
-#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15"
-#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15"
-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%%" #reg "\n" \
-    BUNDLEUNLOCK
-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " %%" #reg ",(%%r15,%%r14)\n" \
-    BUNDLEUNLOCK
-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%" #arg "\n" \
-    BUNDLEUNLOCK
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \
-    BUNDLEUNLOCK
-#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
-    BUNDLELOCK \
-    "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
-    #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \
-    BUNDLEUNLOCK
-#else  // defined(__native_client__) && defined(__x86_64__)
-#define NACL_R14
-#define BUNDLEALIGN
-#define MEMACCESS(base) "(%" #base ")"
-#define MEMACCESS2(offset, base) #offset "(%" #base ")"
-#define MEMLEA(offset, base) #offset "(%" #base ")"
-#define MEMLEA3(offset, index, scale) \
-    #offset "(,%" #index "," #scale ")"
-#define MEMLEA4(offset, base, index, scale) \
-    #offset "(%" #base ",%" #index "," #scale ")"
-#define MEMMOVESTRING(s, d)
-#define MEMSTORESTRING(reg, d)
-#define MEMOPREG(opcode, offset, base, index, scale, reg) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n"
-#define MEMOPMEM(opcode, reg, offset, base, index, scale) \
-    #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
-#define MEMOPARG(opcode, offset, base, index, scale, arg) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
-#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \
-    #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \
-    #reg2 "\n"
-#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \
-    #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
-#endif  // defined(__native_client__) && defined(__x86_64__)
 
-#if defined(__arm__) || defined(__aarch64__)
-#undef MEMACCESS
-#if defined(__native_client__)
-#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
-#else
-#define MEMACCESS(base)
+// Intel Code Analyzer markers.  Insert IACA_START/IACA_END around the code
+// to be measured and then run with iaca -64 libyuv_unittest.
+// IACA_ASM_START and IACA_ASM_END are equivalents that can be used within
+// inline assembly blocks.
+// Example iaca invocation:
+// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest
+
+#if defined(__x86_64__) || defined(__i386__)
+
+#define IACA_ASM_START  \
+  ".byte 0x0F, 0x0B\n"  \
+  " movl $111, %%ebx\n" \
+  ".byte 0x64, 0x67, 0x90\n"
+
+#define IACA_ASM_END         \
+  " movl $222, %%ebx\n"      \
+  ".byte 0x64, 0x67, 0x90\n" \
+  ".byte 0x0F, 0x0B\n"
+
+#define IACA_SSC_MARK(MARK_ID)                        \
+  __asm__ __volatile__("\n\t  movl $" #MARK_ID        \
+                       ", %%ebx"                      \
+                       "\n\t  .byte 0x64, 0x67, 0x90" \
+                       :                              \
+                       :                              \
+                       : "memory");
+
+#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B");
+
+#else /* Visual C */
+#define IACA_UD_BYTES \
+  { __asm _emit 0x0F __asm _emit 0x0B }
+
+#define IACA_SSC_MARK(x) \
+  { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 }
+
+#define IACA_VC64_START __writegsbyte(111, 111);
+#define IACA_VC64_END __writegsbyte(222, 222);
 #endif
-#endif
 
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+#define IACA_START     \
+  {                    \
+    IACA_UD_BYTES      \
+    IACA_SSC_MARK(111) \
+  }
+#define IACA_END       \
+  {                    \
+    IACA_SSC_MARK(222) \
+    IACA_UD_BYTES      \
+  }
+
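For illustration, bracket only the code under measurement and then run IACA
on the resulting binary; the markers embed UD2 (0x0F 0x0B) bytes, so such a
build is analysis-only and faults if executed:

  IACA_START
  for (i = 0; i < width; i += 16) {
    /* kernel under measurement */
  }
  IACA_END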
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422AlphaToARGBRow_NEON(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             const uint8_t* src_a,
+                             uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_vu,
-                        uint8* dst_argb,
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
+void I444ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
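Each row function converts exactly one scanline; planar converters loop over
rows and advance the chroma pointers at the subsampling rate.  A caller
sketch for 4:2:0 input through the 4:2:2 row kernel, assuming the
kYuvI601Constants table declared earlier in this header (the loop itself is
illustrative):

  for (y = 0; y < height; ++y) {
    I422ToARGBRow_NEON(src_y, src_u, src_v, dst_argb, &kYuvI601Constants,
                       width);
    src_y += src_stride_y;
    if (y & 1) {  /* 4:2:0: advance U/V every other row */
      src_u += src_stride_u;
      src_v += src_stride_v;
    }
    dst_argb += dst_stride_argb;
  }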
 
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void I422ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422ToRGBARow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            const uint8_t* src_a,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB24Row_MSA(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422ToRGB565Row_MSA(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void I422ToARGB4444Row_MSA(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb4444,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToARGB1555Row_MSA(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb1555,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void NV12ToRGB565Row_MSA(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
                          int width);
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int width);
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width);
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width);
-void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width);
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width);
-void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width);
-void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width);
-void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width);
-void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width);
-void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width);
-void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width);
-void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width);
-void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width);
-void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y,
+void NV21ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_vu,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width);
+
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width);
+void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void ARGBToUV444Row_MSA(const uint8_t* src_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void ARGBToUVRow_MSA(const uint8_t* src_argb0,
+                     int src_stride_argb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+                       int src_stride_rgb24,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+                     int src_stride_raw,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+                        int src_stride_rgb565,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+                          int src_stride_argb1555,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+                          int src_stride_argb4444,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
+                       int src_stride_rgb565,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
+                         int src_stride_argb1555,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width);
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width);
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width);
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+                         uint8_t* dst_y,
+                         int width);
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+                         uint8_t* dst_y,
+                         int width);
+void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width);
+void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
+void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
+void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
+void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width);
+void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              int width);
-void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y,
+void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              int width);
+void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
 
-void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr,
-                       uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb,
-                            uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr,
-                           uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba,
-                           uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+                        int src_stride_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
+                       int src_stride_bgra,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
+                       int src_stride_abgr,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
+                       int src_stride_rgba,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr,
+                            int src_stride_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
                              int width);
-void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUV444Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void RAWToUVRow_Any_NEON(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                            int src_stride_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                              int src_stride_ptr,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
+void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                              int src_stride_ptr,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
+void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void RAWToUVRow_Any_MSA(const uint8_t* src_ptr,
+                        int src_stride_ptr,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                           int src_stride_ptr,
+                           uint8_t* dst_u,
+                           uint8_t* dst_v,
+                           int width);
+void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                             int src_stride_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
                              int width);
-void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
-                           uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
-                          uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
-                          uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                           uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
-                         uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                            uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
-                              int src_stride_argb1555,
-                              uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444,
-                              int src_stride_argb4444,
-                              uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_u, uint8* dst_v, int width);
-void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra,
-                   uint8* dst_u, uint8* dst_v, int width);
-void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
-                   uint8* dst_u, uint8* dst_v, int width);
-void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
-                   uint8* dst_u, uint8* dst_v, int width);
-void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
-                    uint8* dst_u, uint8* dst_v, int width);
-void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
-                  uint8* dst_u, uint8* dst_v, int width);
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
-                     uint8* dst_u, uint8* dst_v, int width);
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
-                       uint8* dst_u, uint8* dst_v, int width);
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
-                       uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_C(const uint8_t* src_rgb0,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void ARGBToUVJRow_C(const uint8_t* src_rgb0,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void BGRAToUVRow_C(const uint8_t* src_rgb0,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void ABGRToUVRow_C(const uint8_t* src_rgb0,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void RGBAToUVRow_C(const uint8_t* src_rgb0,
+                   int src_stride_rgb,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void RGB24ToUVRow_C(const uint8_t* src_rgb0,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void RAWToUVRow_C(const uint8_t* src_rgb0,
+                  int src_stride_rgb,
+                  uint8_t* dst_u,
+                  uint8_t* dst_v,
+                  int width);
+void RGB565ToUVRow_C(const uint8_t* src_rgb565,
+                     int src_stride_rgb565,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
+                       int src_stride_argb1555,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
+void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
+                       int src_stride_argb4444,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width);
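
Note that the ...ToUVRow functions above take the stride of the source so one call can read two rows and average each 2x2 pixel block down to a single U and V sample (half-width chroma). A minimal sketch of that contract for ARGB, assuming BT.601-style coefficients comparable to those in source/row_common.cc; this is not the upstream implementation:

    #include <stdint.h>

    /* Hedged stand-ins for the coefficient math in source/row_common.cc. */
    static int RGBToU(int r, int g, int b) {
      return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
    }
    static int RGBToV(int r, int g, int b) {
      return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
    }

    /* Average each 2x2 ARGB block (bytes are B,G,R,A) into one U/V pair. */
    void ARGBToUVRow_sketch(const uint8_t* src_argb, int src_stride_argb,
                            uint8_t* dst_u, uint8_t* dst_v, int width) {
      const uint8_t* next = src_argb + src_stride_argb; /* second row */
      int x;
      for (x = 0; x < width - 1; x += 2) {
        int b = (src_argb[0] + src_argb[4] + next[0] + next[4]) >> 2;
        int g = (src_argb[1] + src_argb[5] + next[1] + next[5]) >> 2;
        int r = (src_argb[2] + src_argb[6] + next[2] + next[6]) >> 2;
        *dst_u++ = (uint8_t)RGBToU(r, g, b);
        *dst_v++ = (uint8_t)RGBToV(r, g, b);
        src_argb += 8;
        next += 8;
      }
    }
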
 
-void ARGBToUV444Row_SSSE3(const uint8* src_argb,
-                          uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb,
-                              uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_u,
+                              uint8_t* dst_v,
+                              int width);
 
-void ARGBToUV444Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
-void ARGBToUV411Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUV444Row_C(const uint8_t* src_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
 
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width);
-void MirrorRow_C(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
-void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 
-void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_SSSE3(const uint8_t* src,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
                        int width);
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_NEON(const uint8_t* src_uv,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
                       int width);
-void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                       int width);
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
+void MirrorUVRow_MSA(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void MirrorUVRow_C(const uint8_t* src_uv,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
 
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width);
-void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width);
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width);
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_C(const uint8_t* src_uv,
+                  uint8_t* dst_u,
+                  uint8_t* dst_v,
+                  int width);
+void SplitUVRow_SSE2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width);
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_AVX2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width);
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width);
-void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                      int width);
-void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_MSA(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width);
+void SplitUVRow_Any_SSE2(const uint8_t* src_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
-void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_Any_AVX2(const uint8_t* src_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
-void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_Any_NEON(const uint8_t* src_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
-void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                          int width);
+void SplitUVRow_Any_MSA(const uint8_t* src_ptr,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
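
SplitUVRow deinterleaves a packed UVUV... plane (as in NV12) into separate U and V planes. A two-line sketch of the contract, not the upstream code:

    #include <stdint.h>

    void SplitUVRow_sketch(const uint8_t* src_uv,
                           uint8_t* dst_u, uint8_t* dst_v, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_u[x] = src_uv[2 * x + 0]; /* even bytes -> U plane */
        dst_v[x] = src_uv[2 * x + 1]; /* odd bytes  -> V plane */
      }
    }
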
 
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_C(const uint8_t* src_u,
+                  const uint8_t* src_v,
+                  uint8_t* dst_uv,
                   int width);
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_SSE2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width);
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_AVX2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width);
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width);
-void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_MSA(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width);
+void MergeUVRow_Any_SSE2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
                          int width);
-void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_Any_AVX2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
                          int width);
-void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_Any_NEON(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
                          int width);
+void MergeUVRow_Any_MSA(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
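
MergeUVRow is the inverse operation, interleaving separate U and V planes back into UVUV...; sketched:

    #include <stdint.h>

    void MergeUVRow_sketch(const uint8_t* src_u, const uint8_t* src_v,
                           uint8_t* dst_uv, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_uv[2 * x + 0] = src_u[x];
        dst_uv[2 * x + 1] = src_v[x];
      }
    }
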
 
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
-void CopyRow_AVX(const uint8* src, uint8* dst, int count);
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count);
-void CopyRow_NEON(const uint8* src, uint8* dst, int count);
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
-void CopyRow_C(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count);
-void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count);
+void SplitRGBRow_C(const uint8_t* src_rgb,
+                   uint8_t* dst_r,
+                   uint8_t* dst_g,
+                   uint8_t* dst_b,
+                   int width);
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
+                       uint8_t* dst_r,
+                       uint8_t* dst_g,
+                       uint8_t* dst_b,
+                       int width);
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      int width);
+void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                           uint8_t* dst_r,
+                           uint8_t* dst_g,
+                           uint8_t* dst_b,
+                           int width);
+void SplitRGBRow_Any_NEON(const uint8_t* src_ptr,
+                          uint8_t* dst_r,
+                          uint8_t* dst_g,
+                          uint8_t* dst_b,
+                          int width);
 
-void CopyRow_16_C(const uint16* src, uint16* dst, int count);
+void MergeRGBRow_C(const uint8_t* src_r,
+                   const uint8_t* src_g,
+                   const uint8_t* src_b,
+                   uint8_t* dst_rgb,
+                   int width);
+void MergeRGBRow_SSSE3(const uint8_t* src_r,
+                       const uint8_t* src_g,
+                       const uint8_t* src_b,
+                       uint8_t* dst_rgb,
+                       int width);
+void MergeRGBRow_NEON(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_rgb,
+                      int width);
+void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
+void MergeRGBRow_Any_NEON(const uint8_t* src_r,
+                          const uint8_t* src_g,
+                          const uint8_t* src_b,
+                          uint8_t* dst_rgb,
+                          int width);
 
-void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+void MergeUVRow_16_C(const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint16_t* dst_uv,
+                     int scale, /* 64 for 10 bit */
+                     int width);
+void MergeUVRow_16_AVX2(const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint16_t* dst_uv,
+                        int scale,
+                        int width);
+
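
Per the /* 64 for 10 bit */ comment, the scale parameter left-justifies narrow samples in the 16-bit lane: 0x3FF * 64 = 0xFFC0. A sketch of the behavior the declaration suggests (treat the exact formula as an assumption):

    #include <stdint.h>

    void MergeUVRow_16_sketch(const uint16_t* src_u, const uint16_t* src_v,
                              uint16_t* dst_uv, int scale, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_uv[2 * x + 0] = (uint16_t)(src_u[x] * scale); /* e.g. v << 6 */
        dst_uv[2 * x + 1] = (uint16_t)(src_v[x] * scale);
      }
    }
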
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+                         uint16_t* dst_y,
+                         int scale,
+                         int width);
+void MultiplyRow_16_C(const uint16_t* src_y,
+                      uint16_t* dst_y,
+                      int scale,
+                      int width);
+
+void Convert8To16Row_C(const uint8_t* src_y,
+                       uint16_t* dst_y,
+                       int scale,
+                       int width);
+void Convert8To16Row_SSE2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width);
+void Convert8To16Row_AVX2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width);
+void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr,
+                              uint16_t* dst_ptr,
+                              int scale,
+                              int width);
+void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr,
+                              uint16_t* dst_ptr,
+                              int scale,
+                              int width);
+
+void Convert16To8Row_C(const uint16_t* src_y,
+                       uint8_t* dst_y,
+                       int scale,
+                       int width);
+void Convert16To8Row_SSSE3(const uint16_t* src_y,
+                           uint8_t* dst_y,
+                           int scale,
+                           int width);
+void Convert16To8Row_AVX2(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width);
+void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int scale,
                                int width);
-void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int scale,
+                              int width);
+
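
For Convert16To8Row, scale appears to act as a 16.16 fixed-point factor, so scale = 16384 narrows 10-bit samples since (v * 16384) >> 16 == v >> 2. A hedged sketch:

    #include <stdint.h>

    static uint8_t clamp255_16(int v) { return (uint8_t)(v > 255 ? 255 : v); }

    void Convert16To8Row_sketch(const uint16_t* src_y, uint8_t* dst_y,
                                int scale, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_y[x] = clamp255_16((src_y[x] * scale) >> 16);
      }
    }
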
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width);
+void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);
+void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
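
Throughout this header, the _Any_ variants wrap a SIMD row so widths that are not a multiple of the vector step still work; upstream generates them with ANY* macros and a small temp buffer. Roughly, with an illustrative 32-pixel step:

    #include <stdint.h>

    void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
    void CopyRow_C(const uint8_t* src, uint8_t* dst, int count);

    void CopyRow_Any_sketch(const uint8_t* src, uint8_t* dst, int width) {
      int n = width & ~31;                     /* widest multiple of 32 */
      if (n > 0) CopyRow_AVX(src, dst, n);     /* vector body */
      CopyRow_C(src + n, dst + n, width - n);  /* scalar tail */
    }
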
+void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count);
+
+void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
                                int width);
+void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
 
-void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a,
+void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width);
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width);
+void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
                                   int width);
-void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a,
+void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
                                   int width);
+void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
+void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int width);
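
ARGBExtractAlphaRow simply gathers the A byte of every pixel into a plane:

    #include <stdint.h>

    void ARGBExtractAlphaRow_sketch(const uint8_t* src_argb,
                                    uint8_t* dst_a, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_a[x] = src_argb[4 * x + 3]; /* A is the last byte of B,G,R,A */
      }
    }
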
 
-void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb,
+void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
                                   int width);
-void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb,
+void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
                                   int width);
 
-void SetRow_C(uint8* dst, uint8 v8, int count);
-void SetRow_X86(uint8* dst, uint8 v8, int count);
-void SetRow_ERMS(uint8* dst, uint8 v8, int count);
-void SetRow_NEON(uint8* dst, uint8 v8, int count);
-void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
-void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
+void SetRow_C(uint8_t* dst, uint8_t v8, int width);
+void SetRow_MSA(uint8_t* dst, uint8_t v8, int width);
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width);
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width);
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width);
+void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width);
+void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width);
 
-void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
+void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width);
+void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width);
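
SetRow behaves like a byte memset; ARGBSetRow repeats one 32-bit pixel value. A sketch of the latter:

    #include <stdint.h>
    #include <string.h>

    void ARGBSetRow_sketch(uint8_t* dst_argb, uint32_t v32, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        memcpy(dst_argb + 4 * x, &v32, 4); /* one ARGB pixel per iteration */
      }
    }
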
 
 // ARGBShufflers for BGRAToARGB etc.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
-                      const uint8* shuffler, int width);
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width);
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int width);
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width);
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int width);
-void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const uint8* shuffler, int width);
+void ARGBShuffleRow_C(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      const uint8_t* shuffler,
+                      int width);
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const uint8_t* shuffler,
+                          int width);
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width);
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width);
+void ARGBShuffleRow_MSA(const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        const uint8_t* shuffler,
+                        int width);
+void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              const uint8_t* param,
+                              int width);
+void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             const uint8_t* param,
+                             int width);
+void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             const uint8_t* param,
+                             int width);
+void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            const uint8_t* param,
+                            int width);
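
The shuffler argument holds four byte indices that select the output order of each pixel's B,G,R,A bytes, so {3,2,1,0} converts between ARGB and BGRA byte order. Contract sketch:

    #include <stdint.h>

    void ARGBShuffleRow_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                               const uint8_t* shuffler, int width) {
      int x, i;
      for (x = 0; x < width; ++x) {
        for (i = 0; i < 4; ++i) {
          dst_argb[4 * x + i] = src_argb[4 * x + shuffler[i]];
        }
      }
    }
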
 
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+                          uint8_t* dst_argb,
+                          int width);
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
                             int width);
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
                             int width);
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width);
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width);
 
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width);
+void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width);
+void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
                             int width);
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
+                           uint8_t* dst_argb,
+                           int width);
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
                             int width);
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width);
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width);
-void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb,
+void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
+                           uint8_t* dst_argb,
+                           int width);
+void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
+void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width);
+void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width);
+void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width);
+void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
+                         uint8_t* dst_argb,
+                         int width);
+void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width);
+void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width);
+void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width);
+
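
AR30 is a 2:10:10:10 format: a 2-bit alpha in the top bits and 10 bits per color channel, with 8-bit channels widened as (v << 2) | (v >> 6) so 0xFF maps to 0x3FF. A packing sketch (the bit layout is inferred from the C rows, so treat it as an assumption):

    #include <stdint.h>

    uint32_t PackAR30_sketch(uint8_t b, uint8_t g, uint8_t r, uint8_t a) {
      uint32_t b10 = ((uint32_t)b << 2) | (b >> 6);
      uint32_t g10 = ((uint32_t)g << 2) | (g >> 6);
      uint32_t r10 = ((uint32_t)r << 2) | (r >> 6);
      return b10 | (g10 << 10) | (r10 << 20) | ((uint32_t)(a >> 6) << 30);
    }
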
+void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
                               int width);
-void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width);
+void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
 
-void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb,
+void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
                               int width);
-void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb,
+void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
                               int width);
-void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
 
-void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb,
+void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              int width);
-void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width);
-void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width);
-void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb,
+void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
                               int width);
-void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb,
+void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
 
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
 
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width);
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
 
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
 
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width);
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width);
 
-void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_rgb,
+                             const uint32_t dither4,
+                             int width);
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width);
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width);
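
dither4 packs four per-column dither bytes into one word; each byte is added to the color channels before truncation to 5/6/5 bits, and the pattern repeats every four pixels. Sketch:

    #include <stdint.h>
    #include <string.h>

    static uint8_t clamp255_d(int v) { return (uint8_t)(v > 255 ? 255 : v); }

    void ARGBToRGB565DitherRow_sketch(const uint8_t* src_argb,
                                      uint8_t* dst_rgb,
                                      uint32_t dither4, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        int d = (dither4 >> ((x & 3) * 8)) & 0xff; /* this column's byte */
        int b = clamp255_d(src_argb[4 * x + 0] + d) >> 3;
        int g = clamp255_d(src_argb[4 * x + 1] + d) >> 2;
        int r = clamp255_d(src_argb[4 * x + 2] + d) >> 3;
        uint16_t rgb565 = (uint16_t)(b | (g << 5) | (r << 11));
        memcpy(dst_rgb + 2 * x, &rgb565, 2); /* little-endian pixel store */
      }
    }
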
 
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_rgb,
+                            int width);
+void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_rgb,
+                            int width);
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
 
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_rgb24,
+                         int width);
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width);
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_rgb565,
+                          int width);
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb1555,
+                            int width);
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb4444,
+                            int width);
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                const uint32_t dither4,
+                                int width);
+void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_MSA(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width);
+void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width);
+void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
+                               uint8_t* dst_rgb,
+                               const uint32_t dither4,
+                               int width);
+
+void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width);
+void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width);
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width);
+
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+
+void I444ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void I422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
+void I422ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void I422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
+void I422ToAR30Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void I422AlphaToARGBRow_C(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          const uint8* a_buf,
-                          uint8* dst_argb,
-                          const struct YuvConstants* yuvconstants,
-                          int width);
-void I411ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_argb,
+void I210ToAR30Row_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void NV12ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_uv,
-                     uint8* dst_argb,
+void I210ToARGBRow_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void NV12ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_uv,
-                       uint8* dst_argb,
+void I422AlphaToARGBRow_C(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          const uint8_t* src_a,
+                          uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV12ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_uv,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width);
+void NV12ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_rgb565,
                        const struct YuvConstants* yuvconstants,
                        int width);
-void NV21ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_uv,
-                     uint8* dst_argb,
+void NV21ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_vu,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
-                     uint8* dst_argb,
+void NV12ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_uv,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width);
+void NV21ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width);
+void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void UYVYToARGBRow_C(const uint8* src_uyvy,
-                     uint8* dst_argb,
+void UYVYToARGBRow_C(const uint8_t* src_uyvy,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void I422ToRGBARow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_rgba,
+void I422ToRGBARow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width);
-void I422ToRGB24Row_C(const uint8* src_y,
-                      const uint8* src_u,
-                      const uint8* src_v,
-                      uint8* dst_rgb24,
+void I422ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_u,
+                      const uint8_t* src_v,
+                      uint8_t* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width);
-void I422ToARGB4444Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
+void I422ToARGB4444Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb4444,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToARGB1555Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
+void I422ToARGB1555Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb1555,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_rgb565,
+void I422ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_rgb565,
                        const struct YuvConstants* yuvconstants,
                        int width);
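
Each of the YUV-to-RGB rows takes a struct YuvConstants carrying the fixed-point matrix and range for the colorspace in use (BT.601, BT.709, JPEG full range, and so on). As a rough floating-point stand-in for what a limited-range BT.601 conversion computes per pixel:

    #include <stdint.h>

    static uint8_t clampf(float v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    /* Hedged sketch; the real rows do this in fixed point via YuvConstants. */
    void YuvPixel_sketch(uint8_t y, uint8_t u, uint8_t v,
                         uint8_t* b, uint8_t* g, uint8_t* r) {
      float yf = 1.164f * (y - 16);
      *b = clampf(yf + 2.018f * (u - 128));
      *g = clampf(yf - 0.391f * (u - 128) - 0.813f * (v - 128));
      *r = clampf(yf + 1.596f * (v - 128));
    }
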
-void I422ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToRGBARow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToRGBARow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width);
-void I444ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
+void I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I444ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I444ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width);
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width);
-void I422ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
+
+void I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_ar30,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I411ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
+void I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+                         const uint16_t* u_buf,
+                         const uint16_t* v_buf,
+                         uint8_t* dst_ar30,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I411ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+                         const uint16_t* u_buf,
+                         const uint16_t* v_buf,
+                         uint8_t* dst_argb,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void I422ToAR30Row_AVX2(const uint8_t* y_buf,
+                        const uint8_t* u_buf,
+                        const uint8_t* v_buf,
+                        uint8_t* dst_ar30,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void NV12ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_uv,
-                         uint8* dst_argb,
+void I210ToARGBRow_AVX2(const uint16_t* y_buf,
+                        const uint16_t* u_buf,
+                        const uint16_t* v_buf,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I210ToAR30Row_AVX2(const uint16_t* y_buf,
+                        const uint16_t* u_buf,
+                        const uint16_t* v_buf,
+                        uint8_t* dst_ar30,
+                        const struct YuvConstants* yuvconstants,
+                        int width);
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              const uint8_t* a_buf,
+                              uint8_t* dst_argb,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             const uint8_t* a_buf,
+                             uint8_t* dst_argb,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void NV12ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_uv,
-                           uint8* dst_argb,
+void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_vu,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width);
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_uv,
+                           uint8_t* dst_rgb565,
                            const struct YuvConstants* yuvconstants,
                            int width);
-void NV12ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_argb,
+void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width);
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void NV21ToARGBRow_SSSE3(const uint8* src_y,
-                         const uint8* src_uv,
-                         uint8* dst_argb,
+void NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* vu_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void NV21ToARGBRow_AVX2(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+                        const uint8_t* vu_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
-                         uint8* dst_argb,
+void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
-                         uint8* dst_argb,
+void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
-                        uint8* dst_argb,
+void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
-                        uint8* dst_argb,
+void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width);
-void I422ToRGBARow_SSSE3(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgba,
+void I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_rgba,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb4444,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToARGB4444Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb1555,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToARGB1555Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_u,
-                           const uint8* src_v,
-                           uint8* dst_argb,
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_rgb565,
                            const struct YuvConstants* yuvconstants,
                            int width);
-void I422ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_argb,
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void I422ToRGB24Row_SSSE3(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb24,
+void I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+                          const uint8_t* u_buf,
+                          const uint8_t* v_buf,
+                          uint8_t* dst_rgb24,
                           const struct YuvConstants* yuvconstants,
                           int width);
-void I422ToRGB24Row_AVX2(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
+void I422ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width);
-void I422ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToRGBARow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I444ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
-                                  const uint8* u_buf,
-                                  const uint8* v_buf,
-                                  const uint8* a_buf,
-                                  uint8* dst_argb,
+void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf,
+                             const uint16_t* u_buf,
+                             const uint16_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf,
+                             const uint16_t* u_buf,
+                             const uint16_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf,
+                            const uint16_t* u_buf,
+                            const uint16_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                                  const uint8_t* u_buf,
+                                  const uint8_t* v_buf,
+                                  const uint8_t* a_buf,
+                                  uint8_t* dst_ptr,
                                   const struct YuvConstants* yuvconstants,
                                   int width);
-void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 const uint8* a_buf,
-                                 uint8* dst_argb,
+void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 const uint8_t* a_buf,
+                                 uint8_t* dst_ptr,
                                  const struct YuvConstants* yuvconstants,
                                  int width);
-void I411ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I411ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void NV12ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_uv,
-                             uint8* dst_argb,
+void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void NV12ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_uv,
-                            uint8* dst_argb,
+void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void NV21ToARGBRow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_vu,
-                             uint8* dst_argb,
+void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              const struct YuvConstants* yuvconstants,
+                              int width);
+void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void NV21ToARGBRow_Any_AVX2(const uint8* src_y,
-                            const uint8* src_vu,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y,
-                               const uint8* src_uv,
-                               uint8* dst_argb,
+void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+                               const uint8_t* uv_buf,
+                               uint8_t* dst_ptr,
                                const struct YuvConstants* yuvconstants,
                                int width);
-void NV12ToRGB565Row_Any_AVX2(const uint8* src_y,
-                              const uint8* src_uv,
-                              uint8* dst_argb,
+void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2,
-                             uint8* dst_argb,
+void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy,
-                             uint8* dst_argb,
+void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2,
-                            uint8* dst_argb,
+void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy,
-                            uint8* dst_argb,
+void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToRGBARow_Any_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_rgba,
+void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y,
-                                 const uint8* src_u,
-                                 const uint8* src_v,
-                                 uint8* dst_rgba,
+void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 uint8_t* dst_ptr,
                                  const struct YuvConstants* yuvconstants,
                                  int width);
-void I422ToARGB4444Row_Any_AVX2(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_rgba,
+void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
                                 const struct YuvConstants* yuvconstants,
                                 int width);
-void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y,
-                                 const uint8* src_u,
-                                 const uint8* src_v,
-                                 uint8* dst_rgba,
+void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 uint8_t* dst_ptr,
                                  const struct YuvConstants* yuvconstants,
                                  int width);
-void I422ToARGB1555Row_Any_AVX2(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_rgba,
+void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
                                 const struct YuvConstants* yuvconstants,
                                 int width);
-void I422ToRGB565Row_Any_SSSE3(const uint8* src_y,
-                               const uint8* src_u,
-                               const uint8* src_v,
-                               uint8* dst_rgba,
+void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
                                const struct YuvConstants* yuvconstants,
                                int width);
-void I422ToRGB565Row_Any_AVX2(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_rgba,
+void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void I422ToRGB24Row_Any_SSSE3(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
+void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void I422ToRGB24Row_Any_AVX2(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
 
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width);
+void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
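+
+// I400ToARGB expands a luma-only plane to gray ARGB with opaque alpha.
+// Ignoring the fixed-point luma range expansion, each pixel is roughly:
+//   dst_argb[0] = dst_argb[1] = dst_argb[2] = y;  dst_argb[3] = 255;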
 
 // ARGB preattenuated alpha blend.
-void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
-                        uint8* dst_argb, int width);
-void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
-                    uint8* dst_argb, int width);
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+                        const uint8_t* src_argb1,
+                        uint8_t* dst_argb,
+                        int width);
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width);
+void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+                      const uint8_t* src_argb1,
+                      uint8_t* dst_argb,
+                      int width);
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width);
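+
+// "Preattenuated" means src_argb0 already carries its alpha, so the blend
+// only scales the background. Per channel, roughly (a fixed-point sketch,
+// clamped to 255; destination alpha is forced opaque):
+//   dst = src_argb0 + ((src_argb1 * (256 - src_argb0_alpha)) >> 8);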
 
 // Unattenuated planar alpha blend.
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1,
-                             const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                        const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1,
-                            const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
-                     const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+                         const uint8_t* src1,
+                         const uint8_t* alpha,
+                         uint8_t* dst,
+                         int width);
+void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             int width);
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+                        const uint8_t* src1,
+                        const uint8_t* alpha,
+                        uint8_t* dst,
+                        int width);
+void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void BlendPlaneRow_C(const uint8_t* src0,
+                     const uint8_t* src1,
+                     const uint8_t* alpha,
+                     uint8_t* dst,
+                     int width);
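+
+// The planar blend takes an explicit alpha plane and no premultiplication.
+// A scalar sketch of one pixel:
+//   dst[x] = (src0[x] * alpha[x] + src1[x] * (255 - alpha[x]) + 255) >> 8;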
 
 // ARGB multiply images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
-void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
+void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width);
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
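+
+// Multiply treats each channel as a 0..1 fraction, i.e. dst ~= src0 * src1
+// / 255. A sketch of the scalar form, widening one operand to 8.8 fixed
+// point so the product reduces to a single shift:
+//   dst = ((src0 | (src0 << 8)) * src1) >> 16;  /* per channel */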
 
 // ARGB add images.
-void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
-                  uint8* dst_argb, int width);
-void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
-void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
+void ARGBAddRow_C(const uint8_t* src_argb0,
+                  const uint8_t* src_argb1,
+                  uint8_t* dst_argb,
+                  int width);
+void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width);
+void ARGBAddRow_Any_SSE2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width);
+void ARGBAddRow_Any_AVX2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width);
+void ARGBAddRow_Any_NEON(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void ARGBAddRow_MSA(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width);
+void ARGBAddRow_Any_MSA(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
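+
+// Add is a per-channel saturating add; a sketch of one channel:
+//   dst_argb[i] = min(src_argb0[i] + src_argb1[i], 255);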
 
 // ARGB subtract images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
-void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
-void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                          uint8* dst_argb, int width);
-void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1,
-                              uint8* dst_argb, int width);
+void ARGBSubtractRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width);
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBSubtractRow_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width);
+void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
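+
+// Subtract is the saturating mirror of Add; a sketch of one channel:
+//   dst_argb[i] = max(src_argb0[i] - src_argb1[i], 0);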
 
-void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
+void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   int width);
+void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    const uint32_t param,
+                                    int width);
+void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    const uint32_t param,
+                                    int width);
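+
+// The dither rows take four dither bytes packed into one uint32_t and apply
+// them per pixel column before truncating each channel to 5/6/5 bits.
+// A sketch of the byte selection and the blue channel:
+//   int d = ((const uint8_t*)&param)[x & 3];
+//   uint8_t b5 = min(src_argb[0] + d, 255) >> 3;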
 
-void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int width);
-void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int width);
-
-void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
+void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
 
-void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
+void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
+void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                    const uint32 dither4, int width);
+void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    const uint32_t param,
+                                    int width);
+void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
+                             int width);
+void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
+                               int width);
+void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr,
+                                   uint8_t* dst_ptr,
+                                   const uint32_t param,
+                                   int width);
 
-void I444ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I444ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422AlphaToARGBRow_Any_NEON(const uint8* src_y,
-                                 const uint8* src_u,
-                                 const uint8* src_v,
-                                 const uint8* src_a,
-                                 uint8* dst_argb,
+void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 const uint8_t* a_buf,
+                                 uint8_t* dst_ptr,
                                  const struct YuvConstants* yuvconstants,
                                  int width);
-void I411ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
+void I422ToRGBARow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToRGBARow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb,
-                            const struct YuvConstants* yuvconstants,
-                            int width);
-void I422ToRGB24Row_Any_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb,
+void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
-void I422ToARGB4444Row_Any_NEON(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_argb,
+void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
                                 const struct YuvConstants* yuvconstants,
                                 int width);
-void I422ToARGB1555Row_Any_NEON(const uint8* src_y,
-                                const uint8* src_u,
-                                const uint8* src_v,
-                                uint8* dst_argb,
+void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ptr,
                                 const struct YuvConstants* yuvconstants,
                                 int width);
-void I422ToRGB565Row_Any_NEON(const uint8* src_y,
-                              const uint8* src_u,
-                              const uint8* src_v,
-                              uint8* dst_argb,
+void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void NV12ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_uv,
-                            uint8* dst_argb,
+void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void NV21ToARGBRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_vu,
-                            uint8* dst_argb,
+void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* uv_buf,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void NV12ToRGB565Row_Any_NEON(const uint8* src_y,
-                              const uint8* src_uv,
-                              uint8* dst_argb,
+void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
                               const struct YuvConstants* yuvconstants,
                               int width);
-void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2,
-                            uint8* dst_argb,
+void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy,
-                            uint8* dst_argb,
+void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             const struct YuvConstants* yuvconstants,
                             int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
+void I444ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422ToRGBARow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                const uint8_t* a_buf,
+                                uint8_t* dst_ptr,
+                                const struct YuvConstants* yuvconstants,
+                                int width);
+void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            const struct YuvConstants* yuvconstants,
+                            int width);
+void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ptr,
+                               const struct YuvConstants* yuvconstants,
+                               int width);
+void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* uv_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             const struct YuvConstants* yuvconstants,
+                             int width);
+void NV21ToARGBRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* uv_buf,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr,
+                           uint8_t* dst_ptr,
+                           const struct YuvConstants* yuvconstants,
+                           int width);
+
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
-void I422ToARGBRow_DSPR2(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width);
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
+                     int src_stride_yuy2,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
+void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width);
+void YUY2ToUVRow_C(const uint8_t* src_yuy2,
+                   int src_stride_yuy2,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
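+
+// YUY2 packs two pixels into four bytes as Y0 U Y1 V. The ToYRow variants
+// copy out the luma bytes, ToUVRow averages chroma over two rows (the stride
+// argument locates the second row), and ToUV422Row reads a single row.
+// A sketch of the luma extraction:
+//   for (int x = 0; x < width - 1; x += 2) {
+//     dst_y[x] = src_yuy2[0];
+//     dst_y[x + 1] = src_yuy2[2];
+//     src_yuy2 += 4;
+//   }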
+void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
+                     int src_stride_uyvy,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width);
+void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width);
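
YUY2 stores pixels packed as Y0 U Y1 V, so the luma plane is every even byte; the _UVRow variants also take a row stride because they average the chroma of two adjacent scanlines when producing 4:2:0 output. A minimal sketch of the luma extraction, with a function name of our own choosing:

    #include <stdint.h>

    /* Sketch only: YUY2 is packed Y0 U Y1 V, so luma is every even byte. */
    static void YUY2ToYRowSketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                                 int width) {
      for (int i = 0; i < width; ++i) {
        dst_y[i] = src_yuy2[i * 2];
      }
    }
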
 
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width);
-void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int width);
-void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
-                             uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_NEON(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width);
+void UYVYToUVRow_C(const uint8_t* src_uyvy,
+                   int src_stride_uyvy,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width);
+void UYVYToUV422Row_C(const uint8_t* src_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width);
+void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr,
+                          int src_stride_ptr,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width);
+void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr,
+                             uint8_t* dst_u,
+                             uint8_t* dst_v,
+                             int width);
+void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr,
+                         int src_stride_ptr,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width);
+void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr,
+                            uint8_t* dst_u,
+                            uint8_t* dst_v,
+                            int width);
 
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_C(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int width);
-void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width);
-void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int width);
-void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
-                             uint8* dst_u, uint8* dst_v, int width);
+void I422ToYUY2Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width);
+void I422ToUYVYRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width);
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width);
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width);
+void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width);
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width);
+void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width);
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width);
+void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void I422ToYUY2Row_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_yuy2,
+                       int width);
+void I422ToUYVYRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_uyvy,
+                       int width);
+void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
+void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf,
+                           const uint8_t* u_buf,
+                           const uint8_t* v_buf,
+                           uint8_t* dst_ptr,
+                           int width);
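
Going the other way, I422ToYUY2Row interleaves one luma pair with one U and one V sample per output group. A hedged sketch of the C packing loop, assuming an even width:

    #include <stdint.h>

    /* Sketch: pack planar 4:2:2 into YUY2 (Y0 U Y1 V), width assumed even. */
    static void I422ToYUY2RowSketch(const uint8_t* src_y, const uint8_t* src_u,
                                    const uint8_t* src_v, uint8_t* dst_frame,
                                    int width) {
      for (int x = 0; x < width; x += 2) {
        dst_frame[0] = src_y[0];
        dst_frame[1] = src_u[0];
        dst_frame[2] = src_y[1];
        dst_frame[3] = src_v[0];
        dst_frame += 4;
        src_y += 2;
        src_u += 1;
        src_v += 1;
      }
    }
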
 
-void I422ToYUY2Row_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_yuy2, int width);
-void I422ToUYVYRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_uyvy, int width);
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width);
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width);
-void I422ToYUY2Row_Any_SSE2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_yuy2, int width);
-void I422ToUYVYRow_Any_SSE2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_uyvy, int width);
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width);
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width);
-void I422ToYUY2Row_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_yuy2, int width);
-void I422ToUYVYRow_Any_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_uyvy, int width);
-
 // Effects related row functions.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
-                               int width);
-void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            int width);
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width);
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width);
+void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr,
+                                uint8_t* dst_ptr,
                                 int width);
-void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
                                int width);
-void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr,
+                               uint8_t* dst_ptr,
                                int width);
+void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr,
+                              uint8_t* dst_ptr,
+                              int width);
 
 // Inverse table for unattenuate, shared by C and SSE2.
-extern const uint32 fixed_invtbl8[256];
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+extern const uint32_t fixed_invtbl8[256];
+void ARGBUnattenuateRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width);
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width);
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width);
+void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
                                  int width);
-void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
                                  int width);
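
fixed_invtbl8 exists so unattenuation (dividing the premultiplied color back out by alpha) costs a table lookup and a multiply instead of a per-channel division. A sketch of the idea with an illustrative 16-bit reciprocal table; the exact scaling of the real table is upstream's and may differ:

    #include <stdint.h>

    /* Illustrative reciprocal table: inv_table[a] ~= 65536 / a. A sketch of
       the technique only; the scaling of the real fixed_invtbl8 differs. */
    static uint32_t inv_table[256];

    static void InitInvTable(void) {
      inv_table[0] = 0; /* alpha 0 leaves nothing to recover */
      for (int a = 1; a < 256; ++a) {
        inv_table[a] = 65536u / a;
      }
    }

    /* Unattenuate one channel: premultiplied * 255 / alpha via one multiply. */
    static uint8_t UnattenuateSketch(uint8_t c_premul, uint8_t alpha) {
      uint32_t v = (c_premul * 255u * inv_table[alpha]) >> 16;
      return (uint8_t)(v > 255 ? 255 : v);
    }
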
 
-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width);
+void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width);
 
-void ARGBSepiaRow_C(uint8* dst_argb, int width);
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width);
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width);
+void ARGBSepiaRow_C(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width);
+void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width);
 
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
-                          const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width);
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width);
+void ARGBColorMatrixRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const int8_t* matrix_argb,
+                          int width);
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              const int8_t* matrix_argb,
+                              int width);
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             const int8_t* matrix_argb,
+                             int width);
+void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const int8_t* matrix_argb,
+                            int width);
 
-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+void ARGBColorTableRow_C(uint8_t* dst_argb,
+                         const uint8_t* table_argb,
+                         int width);
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+                           const uint8_t* table_argb,
+                           int width);
 
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width);
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width);
+void RGBColorTableRow_C(uint8_t* dst_argb,
+                        const uint8_t* table_argb,
+                        int width);
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+                          const uint8_t* table_argb,
+                          int width);
 
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
-                       int interval_offset, int width);
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width);
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width);
+void ARGBQuantizeRow_C(uint8_t* dst_argb,
+                       int scale,
+                       int interval_size,
+                       int interval_offset,
+                       int width);
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width);
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width);
+void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
+                         int scale,
+                         int interval_size,
+                         int interval_offset,
+                         int width);
 
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                    uint32 value);
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value);
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value);
+void ARGBShadeRow_C(const uint8_t* src_argb,
+                    uint8_t* dst_argb,
+                    int width,
+                    uint32_t value);
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value);
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value);
+void ARGBShadeRow_MSA(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      int width,
+                      uint32_t value);
 
 // Used for blur.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst, int count);
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width);
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+                                    const int32_t* botleft,
+                                    int width,
+                                    int area,
+                                    uint8_t* dst,
+                                    int count);
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+                                  int32_t* cumsum,
+                                  const int32_t* previous_cumsum,
+                                  int width);
 
-void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft,
-                                 int width, int area, uint8* dst, int count);
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
-                               const int32* previous_cumsum, int width);
+void CumulativeSumToAverageRow_C(const int32_t* tl,
+                                 const int32_t* bl,
+                                 int w,
+                                 int area,
+                                 uint8_t* dst,
+                                 int count);
+void ComputeCumulativeSumRow_C(const uint8_t* row,
+                               int32_t* cumsum,
+                               const int32_t* previous_cumsum,
+                               int width);
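
These are the two halves of a summed-area-table box blur: ComputeCumulativeSumRow extends a running 2-D prefix sum one row at a time, and CumulativeSumToAverageRow reads four corners of that sum per output pixel. A single-channel sketch of the scheme (the real rows carry four ARGB channels per pixel):

    #include <stdint.h>

    /* cumsum has width + 1 entries: cumsum[i] = previous_cumsum[i] + sum of
       row[0..i-1]. Single channel for clarity. */
    static void CumulativeSumRowSketch(const uint8_t* row, int32_t* cumsum,
                                       const int32_t* previous_cumsum,
                                       int width) {
      int32_t sum = 0;
      cumsum[0] = previous_cumsum[0];
      for (int i = 0; i < width; ++i) {
        sum += row[i];
        cumsum[i + 1] = previous_cumsum[i + 1] + sum;
      }
    }

    /* Mean of a box spanning [x, x + w) between a top and bottom prefix row. */
    static uint8_t BoxAverageSketch(const int32_t* top, const int32_t* bot,
                                    int x, int w, int area) {
      int32_t sum = bot[x + w] - bot[x] - top[x + w] + top[x];
      return (uint8_t)(sum / area);
    }
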
 
 LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_C(const uint8_t* src_argb,
+                     int src_argb_stride,
+                     uint8_t* dst_argb,
+                     const float* uv_dudv,
+                     int width);
 LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width);
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                        int src_argb_stride,
+                        uint8_t* dst_argb,
+                        const float* src_dudv,
+                        int width);
 
 // Used for I420Scale, ARGBScale, and ARGBInterpolate.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
-                      ptrdiff_t src_stride_ptr,
-                      int width, int source_y_fraction);
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_C(uint8_t* dst_ptr,
+                      const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      int width,
+                      int source_y_fraction);
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
                           int source_y_fraction);
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
                          int source_y_fraction);
-void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
                          int source_y_fraction);
-void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride_ptr, int width,
-                          int source_y_fraction);
-void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                             ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_MSA(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int width,
+                        int source_y_fraction);
+void InterpolateRow_Any_NEON(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             ptrdiff_t src_stride_ptr,
+                             int width,
                              int source_y_fraction);
-void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr,
+                              const uint8_t* src_ptr,
+                              ptrdiff_t src_stride_ptr,
+                              int width,
                               int source_y_fraction);
-void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                             ptrdiff_t src_stride_ptr, int width,
+void InterpolateRow_Any_AVX2(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             ptrdiff_t src_stride_ptr,
+                             int width,
                              int source_y_fraction);
-void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                              ptrdiff_t src_stride_ptr, int width,
-                              int source_y_fraction);
+void InterpolateRow_Any_MSA(uint8_t* dst_ptr,
+                            const uint8_t* src_ptr,
+                            ptrdiff_t src_stride_ptr,
+                            int width,
+                            int source_y_fraction);
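
InterpolateRow is the vertical blend at the heart of bilinear scaling: it mixes a row with the row one stride below it according to an 8-bit fraction (0 copies the first row, 128 averages the two). A plain-C sketch assuming that 0..255 fraction convention:

    #include <stddef.h>
    #include <stdint.h>

    /* Vertical blend of src_ptr and src_ptr + src_stride, a sketch of the
       C path: dst = src0 * (256 - f) + src1 * f, rounded, f in [0, 255]. */
    static void InterpolateRowSketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                     ptrdiff_t src_stride, int width,
                                     int source_y_fraction) {
      const uint8_t* src_ptr1 = src_ptr + src_stride;
      int f1 = source_y_fraction;
      int f0 = 256 - f1;
      for (int i = 0; i < width; ++i) {
        dst_ptr[i] = (uint8_t)((src_ptr[i] * f0 + src_ptr1[i] * f1 + 128) >> 8);
      }
    }
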
 
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                         ptrdiff_t src_stride_ptr,
-                         int width, int source_y_fraction);
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+                         const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int width,
+                         int source_y_fraction);
 
 // Sobel images.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
-                 uint8* dst_sobelx, int width);
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width);
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width);
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
-                 uint8* dst_sobely, int width);
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width);
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width);
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                uint8* dst_argb, int width);
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width);
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width);
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_y, int width);
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width);
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width);
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                  uint8* dst_argb, int width);
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width);
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width);
-void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_argb, int width);
-void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_argb, int width);
-void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                              uint8* dst_y, int width);
-void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                              uint8* dst_y, int width);
-void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                         uint8* dst_argb, int width);
-void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                         uint8* dst_argb, int width);
+void SobelXRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 const uint8_t* src_y2,
+                 uint8_t* dst_sobelx,
+                 int width);
+void SobelXRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width);
+void SobelXRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width);
+void SobelXRow_MSA(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   const uint8_t* src_y2,
+                   uint8_t* dst_sobelx,
+                   int width);
+void SobelYRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 uint8_t* dst_sobely,
+                 int width);
+void SobelYRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width);
+void SobelYRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width);
+void SobelYRow_MSA(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   uint8_t* dst_sobely,
+                   int width);
+void SobelRow_C(const uint8_t* src_sobelx,
+                const uint8_t* src_sobely,
+                uint8_t* dst_argb,
+                int width);
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width);
+void SobelRow_NEON(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width);
+void SobelRow_MSA(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width);
+void SobelToPlaneRow_C(const uint8_t* src_sobelx,
+                       const uint8_t* src_sobely,
+                       uint8_t* dst_y,
+                       int width);
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width);
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width);
+void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
+                         const uint8_t* src_sobely,
+                         uint8_t* dst_y,
+                         int width);
+void SobelXYRow_C(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width);
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width);
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width);
+void SobelXYRow_MSA(const uint8_t* src_sobelx,
+                    const uint8_t* src_sobely,
+                    uint8_t* dst_argb,
+                    int width);
+void SobelRow_Any_SSE2(const uint8_t* y_buf,
+                       const uint8_t* uv_buf,
+                       uint8_t* dst_ptr,
+                       int width);
+void SobelRow_Any_NEON(const uint8_t* y_buf,
+                       const uint8_t* uv_buf,
+                       uint8_t* dst_ptr,
+                       int width);
+void SobelRow_Any_MSA(const uint8_t* y_buf,
+                      const uint8_t* uv_buf,
+                      uint8_t* dst_ptr,
+                      int width);
+void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf,
+                              const uint8_t* uv_buf,
+                              uint8_t* dst_ptr,
+                              int width);
+void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
+void SobelXYRow_Any_SSE2(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void SobelXYRow_Any_NEON(const uint8_t* y_buf,
+                         const uint8_t* uv_buf,
+                         uint8_t* dst_ptr,
+                         int width);
+void SobelXYRow_Any_MSA(const uint8_t* y_buf,
+                        const uint8_t* uv_buf,
+                        uint8_t* dst_ptr,
+                        int width);
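
The Sobel rows take three adjacent luma rows and emit per-pixel gradient magnitudes, which SobelRow/SobelXYRow then pack into gray or two-channel ARGB. A hedged sketch of the horizontal gradient using the standard [-1 0 1; -2 0 2; -1 0 1] kernel; like the real row functions it reads two pixels past width, which the callers arrange for:

    #include <stdint.h>
    #include <stdlib.h>

    /* Horizontal Sobel over three input rows, output clamped to 0..255. */
    static void SobelXRowSketch(const uint8_t* src_y0, const uint8_t* src_y1,
                                const uint8_t* src_y2, uint8_t* dst_sobelx,
                                int width) {
      for (int i = 0; i < width; ++i) {
        int a = src_y0[i] - src_y0[i + 2];
        int b = src_y1[i] - src_y1[i + 2];
        int c = src_y2[i] - src_y2[i + 2];
        int sobel = abs(a + b * 2 + c);
        dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
      }
    }
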
 
-void ARGBPolynomialRow_C(const uint8* src_argb,
-                         uint8* dst_argb, const float* poly,
+void ARGBPolynomialRow_C(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const float* poly,
                          int width);
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
                             int width);
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
                             int width);
 
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                             const uint8* luma, uint32 lumacoeff);
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+// Scale and convert to half float.
+void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width);
+void HalfFloatRow_SSE2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr,
+                           uint16_t* dst_ptr,
+                           float param,
+                           int width);
+void HalfFloatRow_AVX2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr,
+                           uint16_t* dst_ptr,
+                           float param,
+                           int width);
+void HalfFloatRow_F16C(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_F16C(const uint16_t* src,
+                           uint16_t* dst,
+                           float scale,
+                           int width);
+void HalfFloat1Row_F16C(const uint16_t* src,
+                        uint16_t* dst,
+                        float scale,
+                        int width);
+void HalfFloat1Row_Any_F16C(const uint16_t* src,
+                            uint16_t* dst,
+                            float scale,
+                            int width);
+void HalfFloatRow_NEON(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width);
+void HalfFloatRow_Any_NEON(const uint16_t* src_ptr,
+                           uint16_t* dst_ptr,
+                           float param,
+                           int width);
+void HalfFloat1Row_NEON(const uint16_t* src,
+                        uint16_t* dst,
+                        float scale,
+                        int width);
+void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr,
+                            uint16_t* dst_ptr,
+                            float param,
+                            int width);
+void HalfFloatRow_MSA(const uint16_t* src,
+                      uint16_t* dst,
+                      float scale,
+                      int width);
+void HalfFloatRow_Any_MSA(const uint16_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          float param,
+                          int width);
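
HalfFloatRow multiplies each 16-bit sample by a float scale (callers typically pass 1.0f/65535.0f, or 1.0f/1023.0f for 10-bit content; those values are typical, not mandated) and stores the result as IEEE binary16. A truncating, normal-range-only sketch of the per-sample conversion; the library's SIMD paths are considerably cleverer:

    #include <stdint.h>
    #include <string.h>

    /* Convert one non-negative float in the normal half range to binary16
       bits: rebias the exponent from 127 to 15 and keep the top 10 mantissa
       bits. Truncating sketch, not the library's optimized path. */
    static uint16_t FloatToHalfSketch(float f) {
      uint32_t bits;
      memcpy(&bits, &f, sizeof(bits));
      uint32_t exp = (bits >> 23) & 0xff;
      if (exp <= 112) return 0; /* underflow to +0 */
      return (uint16_t)(((exp - 112) << 10) | ((bits >> 13) & 0x3ff));
    }

    /* dst[i] = half(src[i] * scale). */
    static void HalfFloatRowSketch(const uint16_t* src, uint16_t* dst,
                                   float scale, int width) {
      for (int i = 0; i < width; ++i) {
        dst[i] = FloatToHalfSketch((float)src[i] * scale);
      }
    }
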
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width);
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width);
+void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
+                             float* dst_ptr,
+                             float param,
+                             int width);
+
+void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width,
+                             const uint8_t* luma,
+                             uint32_t lumacoeff);
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+                                 uint8_t* dst_argb,
                                  int width,
-                                 const uint8* luma, uint32 lumacoeff);
+                                 const uint8_t* luma,
+                                 uint32_t lumacoeff);
 
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleMaxSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width);
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width);
+float ScaleSumSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width);
+void ScaleSamples_C(const float* src, float* dst, float scale, int width);
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_ROW_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_ROW_H_
--- a/third_party/libyuv/include/libyuv/scale.h
+++ b/third_party/libyuv/include/libyuv/scale.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_H_
 #define INCLUDE_LIBYUV_SCALE_H_
 
 #include "libyuv/basic_types.h"
@@ -20,25 +20,33 @@
 
 // Supported filtering.
 typedef enum FilterMode {
-  kFilterNone = 0,  // Point sample; Fastest.
-  kFilterLinear = 1,  // Filter horizontally only.
+  kFilterNone = 0,      // Point sample; Fastest.
+  kFilterLinear = 1,    // Filter horizontally only.
   kFilterBilinear = 2,  // Faster than box, but lower quality scaling down.
-  kFilterBox = 3  // Highest quality.
+  kFilterBox = 3        // Highest quality.
 } FilterModeEnum;
 
 // Scale a YUV plane.
 LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
-                int src_width, int src_height,
-                uint8* dst, int dst_stride,
-                int dst_width, int dst_height,
+void ScalePlane(const uint8_t* src,
+                int src_stride,
+                int src_width,
+                int src_height,
+                uint8_t* dst,
+                int dst_stride,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering);
 
 LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
-                   int src_width, int src_height,
-                   uint16* dst, int dst_stride,
-                   int dst_width, int dst_height,
+void ScalePlane_16(const uint16_t* src,
+                   int src_stride,
+                   int src_width,
+                   int src_height,
+                   uint16_t* dst,
+                   int dst_stride,
+                   int dst_width,
+                   int dst_height,
                    enum FilterMode filtering);
 
 // Scales a YUV 4:2:0 image from the src width and height to the
@@ -52,44 +60,64 @@
 // Returns 0 if successful.
 
 LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              int src_width, int src_height,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int dst_width, int dst_height,
+int I420Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering);
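
A usage sketch against the declaration above, halving a frame with bilinear filtering; the dimensions and tightly-packed strides are hypothetical:

    #include "libyuv/scale.h"

    /* Hypothetical buffers: src is 640x360 I420, dst is 320x180 I420.
       Chroma planes are half width, so their strides are half the luma's. */
    int HalveI420(const uint8_t* src_y, const uint8_t* src_u,
                  const uint8_t* src_v, uint8_t* dst_y, uint8_t* dst_u,
                  uint8_t* dst_v) {
      return I420Scale(src_y, 640, src_u, 320, src_v, 320,
                       640, 360,
                       dst_y, 320, dst_u, 160, dst_v, 160,
                       320, 180,
                       kFilterBilinear);
    }
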
 
 LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
-                 const uint16* src_u, int src_stride_u,
-                 const uint16* src_v, int src_stride_v,
-                 int src_width, int src_height,
-                 uint16* dst_y, int dst_stride_y,
-                 uint16* dst_u, int dst_stride_u,
-                 uint16* dst_v, int dst_stride_v,
-                 int dst_width, int dst_height,
+int I420Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
                  enum FilterMode filtering);
 
 #ifdef __cplusplus
 // Legacy API.  Deprecated.
 LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
-          int src_stride_y, int src_stride_u, int src_stride_v,
-          int src_width, int src_height,
-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
-          int dst_width, int dst_height,
+int Scale(const uint8_t* src_y,
+          const uint8_t* src_u,
+          const uint8_t* src_v,
+          int src_stride_y,
+          int src_stride_u,
+          int src_stride_v,
+          int src_width,
+          int src_height,
+          uint8_t* dst_y,
+          uint8_t* dst_u,
+          uint8_t* dst_v,
+          int dst_stride_y,
+          int dst_stride_u,
+          int dst_stride_v,
+          int dst_width,
+          int dst_height,
           LIBYUV_BOOL interpolate);
 
-// Legacy API.  Deprecated.
-LIBYUV_API
-int ScaleOffset(const uint8* src_i420, int src_width, int src_height,
-                uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset,
-                LIBYUV_BOOL interpolate);
-
 // For testing, allow disabling of specialized scalers.
 LIBYUV_API
 void SetUseReferenceImpl(LIBYUV_BOOL use);
@@ -100,4 +128,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_SCALE_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_SCALE_H_
--- a/third_party/libyuv/include/libyuv/scale_argb.h
+++ b/third_party/libyuv/include/libyuv/scale_argb.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_
 #define INCLUDE_LIBYUV_SCALE_ARGB_H_
 
 #include "libyuv/basic_types.h"
@@ -20,32 +20,52 @@
 #endif
 
 LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
-              int src_width, int src_height,
-              uint8* dst_argb, int dst_stride_argb,
-              int dst_width, int dst_height,
+int ARGBScale(const uint8_t* src_argb,
+              int src_stride_argb,
+              int src_width,
+              int src_height,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering);
 
 // Clipped scale takes destination rectangle coordinates for clip values.
 LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
-                  int src_width, int src_height,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int dst_width, int dst_height,
-                  int clip_x, int clip_y, int clip_width, int clip_height,
+int ARGBScaleClip(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  int src_width,
+                  int src_height,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int dst_width,
+                  int dst_height,
+                  int clip_x,
+                  int clip_y,
+                  int clip_width,
+                  int clip_height,
                   enum FilterMode filtering);
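
A usage sketch of the clipped variant: the scale is set up for the whole frame, but only the given destination-space rectangle is written. Sizes are hypothetical; ARGB strides are in bytes, hence width * 4:

    #include "libyuv/scale_argb.h"

    /* Scale 1920x1080 ARGB down to 960x540, but only produce the
       top-left 480x270 quadrant of the destination. Hypothetical buffers. */
    int ScaleTopLeftQuadrant(const uint8_t* src_argb, uint8_t* dst_argb) {
      return ARGBScaleClip(src_argb, 1920 * 4, 1920, 1080,
                           dst_argb, 960 * 4, 960, 540,
                           0, 0, 480, 270,
                           kFilterBilinear);
    }
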
 
 // Scale with YUV conversion to ARGB and clipping.
 LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint32 src_fourcc,
-                       int src_width, int src_height,
-                       uint8* dst_argb, int dst_stride_argb,
-                       uint32 dst_fourcc,
-                       int dst_width, int dst_height,
-                       int clip_x, int clip_y, int clip_width, int clip_height,
+int YUVToARGBScaleClip(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint32_t src_fourcc,
+                       int src_width,
+                       int src_height,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       uint32_t dst_fourcc,
+                       int dst_width,
+                       int dst_height,
+                       int clip_x,
+                       int clip_y,
+                       int clip_width,
+                       int clip_height,
                        enum FilterMode filtering);
 
 #ifdef __cplusplus
@@ -53,4 +73,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_SCALE_ARGB_H_
--- a/third_party/libyuv/include/libyuv/scale_row.h
+++ b/third_party/libyuv/include/libyuv/scale_row.h
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_
 #define INCLUDE_LIBYUV_SCALE_ROW_H_
 
 #include "libyuv/basic_types.h"
@@ -19,10 +19,14 @@
 extern "C" {
 #endif
 
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
@@ -29,7 +33,6 @@
 #define LIBYUV_DISABLE_X86
 #endif
 #endif
-
 // GCC >= 4.7.0 required for AVX2.
 #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
 #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
@@ -45,8 +48,8 @@
 #endif  // __clang__
 
 // Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \
-    defined(_MSC_VER) && _MSC_VER >= 1700
+#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \
+    _MSC_VER >= 1700
 #define VISUALC_HAS_AVX2 1
 #endif  // VisualStudio >= 2012
 
@@ -72,8 +75,9 @@
 // The following are available on all x86 platforms, but
 // require VS2012, clang 3.4 or gcc 4.7.
 // The code supports NaCL but requires a new compiler and validator.
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \
-    defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
+#if !defined(LIBYUV_DISABLE_X86) &&                          \
+    (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \
+     defined(GCC_HAS_AVX2))
 #define HAS_SCALEADDROW_AVX2
 #define HAS_SCALEROWDOWN2_AVX2
 #define HAS_SCALEROWDOWN4_AVX2
@@ -80,7 +84,7 @@
 #endif
 
 // The following are available on Neon platforms:
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+#if !defined(LIBYUV_DISABLE_NEON) && \
     (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
 #define HAS_SCALEARGBCOLS_NEON
 #define HAS_SCALEARGBROWDOWN2_NEON
@@ -93,33 +97,51 @@
 #define HAS_SCALEARGBFILTERCOLS_NEON
 #endif
 
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
-    defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-#define HAS_SCALEROWDOWN2_DSPR2
-#define HAS_SCALEROWDOWN4_DSPR2
-#define HAS_SCALEROWDOWN34_DSPR2
-#define HAS_SCALEROWDOWN38_DSPR2
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#define HAS_SCALEADDROW_MSA
+#define HAS_SCALEARGBCOLS_MSA
+#define HAS_SCALEARGBFILTERCOLS_MSA
+#define HAS_SCALEARGBROWDOWN2_MSA
+#define HAS_SCALEARGBROWDOWNEVEN_MSA
+#define HAS_SCALEFILTERCOLS_MSA
+#define HAS_SCALEROWDOWN2_MSA
+#define HAS_SCALEROWDOWN34_MSA
+#define HAS_SCALEROWDOWN38_MSA
+#define HAS_SCALEROWDOWN4_MSA
 #endif
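
These HAS_* guards only establish that the compiler can build a given
SIMD path; the row function that actually runs is still chosen per CPU
at run time. A minimal sketch of the selection pattern, modeled on
libyuv's scale.cc (TestCpuFlag and the kCpuHas* flags come from
libyuv/cpu_id.h; IS_ALIGNED is the library's alignment helper):

  /* Inside the plane scaler: pick the fastest ScaleRowDown2 available,
     starting from the portable C fallback. */
  void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
                        uint8_t* dst_ptr, int dst_width) = ScaleRowDown2_C;
  #if defined(HAS_SCALEROWDOWN2_SSSE3)
    if (TestCpuFlag(kCpuHasSSSE3)) {
      ScaleRowDown2 = ScaleRowDown2_Any_SSSE3;  /* handles any dst_width */
      if (IS_ALIGNED(dst_width, 16)) {
        ScaleRowDown2 = ScaleRowDown2_SSSE3;    /* whole vectors only */
      }
    }
  #endif
  #if defined(HAS_SCALEROWDOWN2_MSA)
    if (TestCpuFlag(kCpuHasMSA)) {
      ScaleRowDown2 = ScaleRowDown2_Any_MSA;
      if (IS_ALIGNED(dst_width, 32)) {
        ScaleRowDown2 = ScaleRowDown2_MSA;
      }
    }
  #endif
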
 
 // Scale ARGB vertically with bilinear interpolation.
 void ScalePlaneVertical(int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_argb, uint8* dst_argb,
-                        int x, int y, int dy,
-                        int bpp, enum FilterMode filtering);
+                        int dst_width,
+                        int dst_height,
+                        int src_stride,
+                        int dst_stride,
+                        const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        int x,
+                        int y,
+                        int dy,
+                        int bpp,
+                        enum FilterMode filtering);
 
 void ScalePlaneVertical_16(int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint16* src_argb, uint16* dst_argb,
-                           int x, int y, int dy,
-                           int wpp, enum FilterMode filtering);
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint16_t* src_argb,
+                           uint16_t* dst_argb,
+                           int x,
+                           int y,
+                           int dy,
+                           int wpp,
+                           enum FilterMode filtering);
 
 // Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
-                                  int dst_width, int dst_height,
+enum FilterMode ScaleFilterReduce(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
                                   enum FilterMode filtering);
 
 // Divide num by div and return as 16.16 fixed point result.
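
16.16 fixed point stores a value scaled by 65536: the integer part in
the high 16 bits, the fraction in the low 16. A sketch of such a divide
and of how the scaler steps with the result (FixedDiv16 is a
hypothetical name for illustration; it assumes div != 0 and that the
64-bit intermediate does not overflow):

  #include <stdint.h>

  static int FixedDiv16(int num, int div) {
    /* e.g. FixedDiv16(640, 480) == 0x00015555, i.e. ~1.3333 */
    return (int)(((int64_t)num << 16) / div);
  }

  /* A horizontal pass precomputes dx = FixedDiv16(src_width, dst_width)
     and then, per output pixel, reads src_ptr[x >> 16], blends using the
     x & 0xffff fraction, and advances x += dx. */
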
@@ -137,367 +159,786 @@
 #endif
 
 // Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
-                int dst_width, int dst_height,
+void ScaleSlope(int src_width,
+                int src_height,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering,
-                int* x, int* y, int* dx, int* dy);
+                int* x,
+                int* y,
+                int* dx,
+                int* dy);
 
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width);
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width);
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width);
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                              uint16* dst, int dst_width);
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width);
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width);
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width);
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width);
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width);
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width);
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width);
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width);
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width);
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                 int dst_width, int x, int dx);
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                    int dst_width, int x, int dx);
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
-                    int dst_width, int, int);
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int, int);
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx);
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                          int dst_width, int x, int dx);
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
-                         int dst_width, int x, int dx);
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                            int dst_width, int x, int dx);
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width);
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width);
-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+void ScaleRowDown2_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width);
+void ScaleRowDown2_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width);
+void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width);
+void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16_t* dst,
+                              int dst_width);
+void ScaleRowDown2Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+                            uint8_t* dst,
+                            int dst_width);
+void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width);
+void ScaleRowDown4_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width);
+void ScaleRowDown4_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width);
+void ScaleRowDown4Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width);
+void ScaleRowDown34_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width);
+void ScaleRowDown34_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width);
+void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width);
+void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width);
-void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width);
-void ScaleARGBRowDown2_C(const uint8* src_argb,
+                               uint16_t* d,
+                               int dst_width);
+void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width);
+void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width);
+void ScaleCols_C(uint8_t* dst_ptr,
+                 const uint8_t* src_ptr,
+                 int dst_width,
+                 int x,
+                 int dx);
+void ScaleCols_16_C(uint16_t* dst_ptr,
+                    const uint16_t* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx);
+void ScaleColsUp2_C(uint8_t* dst_ptr,
+                    const uint8_t* src_ptr,
+                    int dst_width,
+                    int,
+                    int);
+void ScaleColsUp2_16_C(uint16_t* dst_ptr,
+                       const uint16_t* src_ptr,
+                       int dst_width,
+                       int,
+                       int);
+void ScaleFilterCols_C(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx);
+void ScaleFilterCols_16_C(uint16_t* dst_ptr,
+                          const uint16_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx);
+void ScaleFilterCols64_C(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x32,
+                         int dx);
+void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
+                            const uint16_t* src_ptr,
+                            int dst_width,
+                            int x32,
+                            int dx);
+void ScaleRowDown38_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width);
+void ScaleRowDown38_16_C(const uint16_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+                         uint16_t* dst,
+                         int dst_width);
+void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               int dst_width);
+void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_16_C(const uint16_t* src_ptr,
+                      uint32_t* dst_ptr,
+                      int src_width);
+void ScaleARGBRowDown2_C(const uint8_t* src_argb,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_argb,
+                         int dst_width);
+void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
                             int src_stepx,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
                                int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
-                     int dst_width, int x, int dx);
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
-                       int dst_width, int x, int dx);
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int, int);
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx);
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
-                             int dst_width, int x, int dx);
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBCols_C(uint8_t* dst_argb,
+                     const uint8_t* src_argb,
+                     int dst_width,
+                     int x,
+                     int dx);
+void ScaleARGBCols64_C(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x32,
+                       int dx);
+void ScaleARGBColsUp2_C(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int,
+                        int);
+void ScaleARGBFilterCols_C(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x32,
+                             int dx);
 
 // Specialized scalers for x86.
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
 
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width);
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width);
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
 
-void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr,
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
+void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr,
                                     ptrdiff_t src_stride,
-                                    uint8* dst_ptr, int dst_width);
+                                    uint8_t* dst_ptr,
+                                    int dst_width);
 
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
+void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
 
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx);
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx);
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx);
 
-
 // ARGB Column functions
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx);
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx);
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx);
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx);
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx);
-void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
-                                  int dst_width, int x, int dx);
-void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb,
-                            int dst_width, int x, int dx);
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx);
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                               const uint8_t* src_argb,
+                               int dst_width,
+                               int x,
+                               int dx);
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx);
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+                              const uint8_t* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx);
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx);
+void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr,
+                                  const uint8_t* src_ptr,
+                                  int dst_width,
+                                  int x,
+                                  int dx);
+void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr,
+                            const uint8_t* src_ptr,
+                            int dst_width,
+                            int x,
+                            int dx);
+void ScaleARGBFilterCols_MSA(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleARGBCols_MSA(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x,
+                       int dx);
+void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr,
+                                 const uint8_t* src_ptr,
+                                 int dst_width,
+                                 int x,
+                                 int dx);
+void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx);
 
 // ARGB Row functions
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width);
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width);
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width);
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width);
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst, int dst_width);
-void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb,
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr,
+                                ptrdiff_t src_stride,
+                                uint8_t* dst_ptr,
+                                int dst_width);
+void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst, int dst_width);
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
+void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
 
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
                                int src_stepx,
-                               uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                               uint8_t* dst_argb,
+                               int dst_width);
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width);
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width);
+void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width);
+void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
                                    int src_stepx,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       int src_stepx,
-                                      uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
                                    int src_stepx,
-                                   uint8* dst_argb, int dst_width);
-void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       int src_stepx,
-                                      uint8* dst_argb, int dst_width);
+                                      uint8_t* dst_ptr,
+                                      int dst_width);
+void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  int32_t src_stepx,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     int src_stepx,
+                                     uint8_t* dst_ptr,
+                                     int dst_width);
 
 // ScaleRowDown2Box also used by planar functions
 // NEON downscalers with interpolation.
 
 // Note - not static due to reuse in convert for 444 to 420.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width);
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst, int dst_width);
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width);
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width);
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width);
 
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width);
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
 
 // Down scale from 4 to 3 pixels. Use the NEON multilane read/write
 //  to load every 4th pixel into 4 different registers.
 // Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+                         uint8_t* dst_ptr,
+                         int dst_width);
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+                               uint8_t* dst_ptr,
+                               int dst_width);
 
 // 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width);
+                         uint8_t* dst_ptr,
+                         int dst_width);
 // 32x3 -> 12x1
-void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+                               uint8_t* dst_ptr,
+                               int dst_width);
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+                               uint8_t* dst_ptr,
+                               int dst_width);
 
-void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                  uint8* dst, int dst_width);
-void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width);
-void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width);
-void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width);
-void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                                   uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width);
+void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
+void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
+void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
 // 32 -> 12
-void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                             uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst_ptr,
+                             int dst_width);
 // 32x3 -> 12x1
-void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width);
+void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
+                                   ptrdiff_t src_stride,
+                                   uint8_t* dst_ptr,
+                                   int dst_width);
 
-void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
-void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width);
+void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
+                          uint16_t* dst_ptr,
+                          int src_width);
 
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx);
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx);
 
-void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                              int dst_width, int x, int dx);
+void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr,
+                              const uint8_t* src_ptr,
+                              int dst_width,
+                              int x,
+                              int dx);
 
-void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width);
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width);
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width);
-void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width);
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width);
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width);
-void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width);
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width);
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width);
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width);
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width);
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
+void ScaleFilterCols_MSA(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x,
+                         int dx);
+void ScaleRowDown34_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width);
+void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
+void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width);
 
+void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_ptr,
+                                 int dst_width);
+void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width);
+void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width);
+void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleAddRow_Any_MSA(const uint8_t* src_ptr,
+                         uint16_t* dst_ptr,
+                         int src_width);
+void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr,
+                             const uint8_t* src_ptr,
+                             int dst_width,
+                             int x,
+                             int dx);
+void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width);
+void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_ptr,
+                                  int dst_width);
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_SCALE_ROW_H_
--- a/third_party/libyuv/include/libyuv/version.h
+++ b/third_party/libyuv/include/libyuv/version.h
@@ -8,9 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1616
+#define LIBYUV_VERSION 1711
 
-#endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_VERSION_H_
--- a/third_party/libyuv/include/libyuv/video_common.h
+++ b/third_party/libyuv/include/libyuv/video_common.h
@@ -10,7 +10,7 @@
 
 // Common definitions for video, including fourcc and VideoFormat.
 
-#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_  // NOLINT
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_
 #define INCLUDE_LIBYUV_VIDEO_COMMON_H_
 
 #include "libyuv/basic_types.h"
@@ -28,13 +28,13 @@
 // Needs to be a macro otherwise the OS X compiler complains when the kFormat*
 // constants are used in a switch.
 #ifdef __cplusplus
-#define FOURCC(a, b, c, d) ( \
-    (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
-    (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#define FOURCC(a, b, c, d)                                        \
+  ((static_cast<uint32_t>(a)) | (static_cast<uint32_t>(b) << 8) | \
+   (static_cast<uint32_t>(c) << 16) | (static_cast<uint32_t>(d) << 24))
 #else
-#define FOURCC(a, b, c, d) ( \
-    ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
-    ((uint32)(c) << 16) | ((uint32)(d) << 24))  /* NOLINT */
+#define FOURCC(a, b, c, d)                                     \
+  (((uint32_t)(a)) | ((uint32_t)(b) << 8) |       /* NOLINT */ \
+   ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */
 #endif
 
 // Some pages discussing FourCC codes:
@@ -53,38 +53,33 @@
   FOURCC_I420 = FOURCC('I', '4', '2', '0'),
   FOURCC_I422 = FOURCC('I', '4', '2', '2'),
   FOURCC_I444 = FOURCC('I', '4', '4', '4'),
-  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
   FOURCC_I400 = FOURCC('I', '4', '0', '0'),
   FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
   FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
   FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
   FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+  FOURCC_H010 = FOURCC('H', '0', '1', '0'),  // Unofficial fourcc. 10-bit samples in the LSBs.
 
-  // 2 Secondary YUV formats: row biplanar.
+  // 1 Secondary YUV format: row biplanar.
   FOURCC_M420 = FOURCC('M', '4', '2', '0'),
-  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),  // deprecated.
 
-  // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+  // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 2 10 bpc.
   FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
   FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
   FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+  FOURCC_AR30 = FOURCC('A', 'R', '3', '0'),  // 10 bits per channel. 2101010.
+  FOURCC_AB30 = FOURCC('A', 'B', '3', '0'),  // ABGR version of 10-bit AR30.
   FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
-  FOURCC_RAW  = FOURCC('r', 'a', 'w', ' '),
+  FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
   FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
   FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'),  // rgb565 LE.
   FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'),  // argb1555 LE.
   FOURCC_R444 = FOURCC('R', '4', '4', '4'),  // argb4444 LE.
 
-  // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
-  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
-  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
-  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
-  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
-
   // 1 Primary Compressed YUV format.
   FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
 
-  // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+  // 7 Auxiliary YUV variations: 3 with U and V planes swapped, 1 Alias.
   FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
   FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
   FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
@@ -112,7 +107,13 @@
   FOURCC_L565 = FOURCC('L', '5', '6', '5'),  // Alias for RGBP.
   FOURCC_5551 = FOURCC('5', '5', '5', '1'),  // Alias for RGBO.
 
-  // 1 Auxiliary compressed YUV format set aside for capturer.
+  // Deprecated formats. Not supported, but defined for backward compatibility.
+  FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+  FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+  FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+  FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+  FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
   FOURCC_H264 = FOURCC('H', '2', '6', '4'),
 
   // Match any fourcc.
@@ -136,8 +137,10 @@
   FOURCC_BPP_BGRA = 32,
   FOURCC_BPP_ABGR = 32,
   FOURCC_BPP_RGBA = 32,
+  FOURCC_BPP_AR30 = 32,
+  FOURCC_BPP_AB30 = 32,
   FOURCC_BPP_24BG = 24,
-  FOURCC_BPP_RAW  = 24,
+  FOURCC_BPP_RAW = 24,
   FOURCC_BPP_RGBP = 16,
   FOURCC_BPP_RGBO = 16,
   FOURCC_BPP_R444 = 16,
@@ -152,6 +155,7 @@
   FOURCC_BPP_J420 = 12,
   FOURCC_BPP_J400 = 8,
   FOURCC_BPP_H420 = 12,
+  FOURCC_BPP_H010 = 24,
   FOURCC_BPP_MJPG = 0,  // 0 means unknown.
   FOURCC_BPP_H264 = 0,
   FOURCC_BPP_IYUV = 12,
@@ -170,11 +174,11 @@
   FOURCC_BPP_CM24 = 24,
 
   // Match any fourcc.
-  FOURCC_BPP_ANY  = 0,  // 0 means unknown.
+  FOURCC_BPP_ANY = 0,  // 0 means unknown.
 };
 
 // Converts fourcc aliases into canonical ones.
-LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc);
 
 #ifdef __cplusplus
 }  // extern "C"
@@ -181,4 +185,4 @@
 }  // namespace libyuv
 #endif
 
-#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_  NOLINT
+#endif  // INCLUDE_LIBYUV_VIDEO_COMMON_H_
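
An aside on the FOURCC hunk above: the macro simply packs four ASCII bytes little-endian into a uint32_t. A minimal standalone sketch (illustration only, not part of the patch):

#include <cstdint>
#include <cstdio>

// Same packing as the FOURCC macro in video_common.h above.
#define FOURCC(a, b, c, d)                                        \
  ((static_cast<uint32_t>(a)) | (static_cast<uint32_t>(b) << 8) | \
   (static_cast<uint32_t>(c) << 16) | (static_cast<uint32_t>(d) << 24))

int main() {
  // 'I' = 0x49, '4' = 0x34, '2' = 0x32, '0' = 0x30, so the constant reads
  // "I420" when the uint32_t is stored little-endian.
  printf("FOURCC_I420 = 0x%08x\n", FOURCC('I', '4', '2', '0'));  // 0x30323449
  return 0;
}
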
--- a/third_party/libyuv/source/compare.cc
+++ b/third_party/libyuv/source/compare.cc
@@ -29,10 +29,10 @@
 
 // hash seed of 5381 recommended.
 LIBYUV_API
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) {
   const int kBlockSize = 1 << 15;  // 32768;
   int remainder;
-  uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) =
+  uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) =
       HashDjb2_C;
 #if defined(HAS_HASHDJB2_SSE41)
   if (TestCpuFlag(kCpuHasSSE41)) {
@@ -45,18 +45,18 @@
   }
 #endif
 
-  while (count >= (uint64)(kBlockSize)) {
+  while (count >= (uint64_t)(kBlockSize)) {
     seed = HashDjb2_SSE(src, kBlockSize, seed);
     src += kBlockSize;
     count -= kBlockSize;
   }
-  remainder = (int)(count) & ~15;
+  remainder = (int)count & ~15;
   if (remainder) {
     seed = HashDjb2_SSE(src, remainder, seed);
     src += remainder;
     count -= remainder;
   }
-  remainder = (int)(count) & 15;
+  remainder = (int)count & 15;
   if (remainder) {
     seed = HashDjb2_C(src, remainder, seed);
   }
@@ -63,7 +63,7 @@
   return seed;
 }
 
-static uint32 ARGBDetectRow_C(const uint8* argb, int width) {
+static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     if (argb[0] != 255) {  // First byte is not Alpha of 255, so not ARGB.
@@ -94,8 +94,11 @@
 // Scan an opaque argb image and return fourcc based on alpha offset.
 // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown.
 LIBYUV_API
-uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) {
-  uint32 fourcc = 0;
+uint32_t ARGBDetect(const uint8_t* argb,
+                    int stride_argb,
+                    int width,
+                    int height) {
+  uint32_t fourcc = 0;
   int h;
 
   // Coalesce rows.
@@ -111,19 +114,80 @@
   return fourcc;
 }
 
+// NEON version accumulates in 16-bit shorts, which overflow at 65536 bytes.
+// So the actual maximum is 1 less loop, which is 65536 - 32 bytes.
+
+LIBYUV_API
+uint64_t ComputeHammingDistance(const uint8_t* src_a,
+                                const uint8_t* src_b,
+                                int count) {
+  const int kBlockSize = 1 << 15;  // 32768;
+  const int kSimdSize = 64;
+  // SIMD for multiple of 64, and C for remainder
+  int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1);
+  uint64_t diff = 0;
+  int i;
+  uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b,
+                              int count) = HammingDistance_C;
+#if defined(HAS_HAMMINGDISTANCE_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    HammingDistance = HammingDistance_NEON;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    HammingDistance = HammingDistance_SSSE3;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_SSE42)
+  if (TestCpuFlag(kCpuHasSSE42)) {
+    HammingDistance = HammingDistance_SSE42;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HammingDistance = HammingDistance_AVX2;
+  }
+#endif
+#if defined(HAS_HAMMINGDISTANCE_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    HammingDistance = HammingDistance_MSA;
+  }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+ : diff)
+#endif
+  for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+    diff += HammingDistance(src_a + i, src_b + i, kBlockSize);
+  }
+  src_a += count & ~(kBlockSize - 1);
+  src_b += count & ~(kBlockSize - 1);
+  if (remainder) {
+    diff += HammingDistance(src_a, src_b, remainder);
+    src_a += remainder;
+    src_b += remainder;
+  }
+  remainder = count & (kSimdSize - 1);
+  if (remainder) {
+    diff += HammingDistance_C(src_a, src_b, remainder);
+  }
+  return diff;
+}
+
 // TODO(fbarchard): Refactor into row function.
 LIBYUV_API
-uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
-                             int count) {
+uint64_t ComputeSumSquareError(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
   // SumSquareError returns values 0 to 65535 for each squared difference.
-  // Up to 65536 of those can be summed and remain within a uint32.
-  // After each block of 65536 pixels, accumulate into a uint64.
+  // Up to 65536 of those can be summed and remain within a uint32_t.
+  // After each block of 65536 pixels, accumulate into a uint64_t.
   const int kBlockSize = 65536;
   int remainder = count & (kBlockSize - 1) & ~31;
-  uint64 sse = 0;
+  uint64_t sse = 0;
   int i;
-  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
-      SumSquareError_C;
+  uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b,
+                             int count) = SumSquareError_C;
 #if defined(HAS_SUMSQUAREERROR_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     SumSquareError = SumSquareError_NEON;
@@ -141,8 +205,13 @@
     SumSquareError = SumSquareError_AVX2;
   }
 #endif
+#if defined(HAS_SUMSQUAREERROR_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SumSquareError = SumSquareError_MSA;
+  }
+#endif
 #ifdef _OPENMP
-#pragma omp parallel for reduction(+: sse)
+#pragma omp parallel for reduction(+ : sse)
 #endif
   for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
     sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
@@ -162,14 +231,16 @@
 }
 
 LIBYUV_API
-uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
-                                  const uint8* src_b, int stride_b,
-                                  int width, int height) {
-  uint64 sse = 0;
+uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a,
+                                    int stride_a,
+                                    const uint8_t* src_b,
+                                    int stride_b,
+                                    int width,
+                                    int height) {
+  uint64_t sse = 0;
   int h;
   // Coalesce rows.
-  if (stride_a == width &&
-      stride_b == width) {
+  if (stride_a == width && stride_b == width) {
     width *= height;
     height = 1;
     stride_a = stride_b = 0;
@@ -183,66 +254,76 @@
 }
 
 LIBYUV_API
-double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) {
   double psnr;
   if (sse > 0) {
-    double mse = (double)(count) / (double)(sse);
+    double mse = (double)count / (double)sse;
     psnr = 10.0 * log10(255.0 * 255.0 * mse);
   } else {
-    psnr = kMaxPsnr;      // Limit to prevent divide by 0
+    psnr = kMaxPsnr;  // Limit to prevent divide by 0
   }
 
-  if (psnr > kMaxPsnr)
+  if (psnr > kMaxPsnr) {
     psnr = kMaxPsnr;
+  }
 
   return psnr;
 }
 
 LIBYUV_API
-double CalcFramePsnr(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height) {
-  const uint64 samples = width * height;
-  const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
-                                                src_b, stride_b,
-                                                width, height);
+double CalcFramePsnr(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height) {
+  const uint64_t samples = (uint64_t)width * (uint64_t)height;
+  const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b,
+                                                  stride_b, width, height);
   return SumSquareErrorToPsnr(sse, samples);
 }
 
 LIBYUV_API
-double I420Psnr(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height) {
-  const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
-                                                  src_y_b, stride_y_b,
-                                                  width, height);
+double I420Psnr(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height) {
+  const uint64_t sse_y = ComputeSumSquareErrorPlane(
+      src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
   const int width_uv = (width + 1) >> 1;
   const int height_uv = (height + 1) >> 1;
-  const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
-                                                  src_u_b, stride_u_b,
-                                                  width_uv, height_uv);
-  const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
-                                                  src_v_b, stride_v_b,
-                                                  width_uv, height_uv);
-  const uint64 samples = width * height + 2 * (width_uv * height_uv);
-  const uint64 sse = sse_y + sse_u + sse_v;
+  const uint64_t sse_u = ComputeSumSquareErrorPlane(
+      src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv);
+  const uint64_t sse_v = ComputeSumSquareErrorPlane(
+      src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv);
+  const uint64_t samples = (uint64_t)width * (uint64_t)height +
+                           2 * ((uint64_t)width_uv * (uint64_t)height_uv);
+  const uint64_t sse = sse_y + sse_u + sse_v;
   return SumSquareErrorToPsnr(sse, samples);
 }
 
-static const int64 cc1 =  26634;  // (64^2*(.01*255)^2
-static const int64 cc2 = 239708;  // (64^2*(.03*255)^2
+static const int64_t cc1 = 26634;   // 64^2 * (.01*255)^2
+static const int64_t cc2 = 239708;  // 64^2 * (.03*255)^2
 
-static double Ssim8x8_C(const uint8* src_a, int stride_a,
-                        const uint8* src_b, int stride_b) {
-  int64 sum_a = 0;
-  int64 sum_b = 0;
-  int64 sum_sq_a = 0;
-  int64 sum_sq_b = 0;
-  int64 sum_axb = 0;
+static double Ssim8x8_C(const uint8_t* src_a,
+                        int stride_a,
+                        const uint8_t* src_b,
+                        int stride_b) {
+  int64_t sum_a = 0;
+  int64_t sum_b = 0;
+  int64_t sum_sq_a = 0;
+  int64_t sum_sq_b = 0;
+  int64_t sum_axb = 0;
 
   int i;
   for (i = 0; i < 8; ++i) {
@@ -260,22 +341,22 @@
   }
 
   {
-    const int64 count = 64;
+    const int64_t count = 64;
     // scale the constants by number of pixels
-    const int64 c1 = (cc1 * count * count) >> 12;
-    const int64 c2 = (cc2 * count * count) >> 12;
+    const int64_t c1 = (cc1 * count * count) >> 12;
+    const int64_t c2 = (cc2 * count * count) >> 12;
 
-    const int64 sum_a_x_sum_b = sum_a * sum_b;
+    const int64_t sum_a_x_sum_b = sum_a * sum_b;
 
-    const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
-                         (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+    const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) *
+                           (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
 
-    const int64 sum_a_sq = sum_a*sum_a;
-    const int64 sum_b_sq = sum_b*sum_b;
+    const int64_t sum_a_sq = sum_a * sum_a;
+    const int64_t sum_b_sq = sum_b * sum_b;
 
-    const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
-                         (count * sum_sq_a - sum_a_sq +
-                          count * sum_sq_b - sum_b_sq + c2);
+    const int64_t ssim_d =
+        (sum_a_sq + sum_b_sq + c1) *
+        (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2);
 
     if (ssim_d == 0.0) {
       return DBL_MAX;
@@ -288,13 +369,16 @@
 // on the 4x4 pixel grid. Such arrangement allows the windows to overlap
 // block boundaries to penalize blocking artifacts.
 LIBYUV_API
-double CalcFrameSsim(const uint8* src_a, int stride_a,
-                     const uint8* src_b, int stride_b,
-                     int width, int height) {
+double CalcFrameSsim(const uint8_t* src_a,
+                     int stride_a,
+                     const uint8_t* src_b,
+                     int stride_b,
+                     int width,
+                     int height) {
   int samples = 0;
   double ssim_total = 0;
-  double (*Ssim8x8)(const uint8* src_a, int stride_a,
-                    const uint8* src_b, int stride_b) = Ssim8x8_C;
+  double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b,
+                    int stride_b) = Ssim8x8_C;
 
   // sample points start at each 4x4 location
   int i;
@@ -314,22 +398,27 @@
 }
 
 LIBYUV_API
-double I420Ssim(const uint8* src_y_a, int stride_y_a,
-                const uint8* src_u_a, int stride_u_a,
-                const uint8* src_v_a, int stride_v_a,
-                const uint8* src_y_b, int stride_y_b,
-                const uint8* src_u_b, int stride_u_b,
-                const uint8* src_v_b, int stride_v_b,
-                int width, int height) {
-  const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
-                                      src_y_b, stride_y_b, width, height);
+double I420Ssim(const uint8_t* src_y_a,
+                int stride_y_a,
+                const uint8_t* src_u_a,
+                int stride_u_a,
+                const uint8_t* src_v_a,
+                int stride_v_a,
+                const uint8_t* src_y_b,
+                int stride_y_b,
+                const uint8_t* src_u_b,
+                int stride_u_b,
+                const uint8_t* src_v_b,
+                int stride_v_b,
+                int width,
+                int height) {
+  const double ssim_y =
+      CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height);
   const int width_uv = (width + 1) >> 1;
   const int height_uv = (height + 1) >> 1;
-  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
-                                      src_u_b, stride_u_b,
+  const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b,
                                       width_uv, height_uv);
-  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
-                                      src_v_b, stride_v_b,
+  const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b,
                                       width_uv, height_uv);
   return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
 }
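
A note on the new ComputeHammingDistance() above: count is split three ways so the SIMD kernel only ever sees sizes it can handle, with HammingDistance_C covering the sub-64-byte tail. A sketch of that arithmetic, using the same kBlockSize/kSimdSize values and an arbitrary example length:

#include <cstdio>

int main() {
  const int kBlockSize = 1 << 15;  // 32768, as in ComputeHammingDistance
  const int kSimdSize = 64;
  const int count = 100000;  // arbitrary example length

  int blocks = count & ~(kBlockSize - 1);  // 98304: whole 32 KB SIMD blocks
  int simd_rem = count & (kBlockSize - 1) & ~(kSimdSize - 1);  // 1664: SIMD
  int tail = count & (kSimdSize - 1);  // 32: handled by HammingDistance_C

  printf("%d = %d + %d + %d\n", count, blocks, simd_rem, tail);
  return 0;
}
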
--- a/third_party/libyuv/source/compare_common.cc
+++ b/third_party/libyuv/source/compare_common.cc
@@ -17,12 +17,72 @@
 extern "C" {
 #endif
 
-uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
-  uint32 sse = 0u;
+#if ORIGINAL_OPT
+uint32_t HammingDistance_C1(const uint8_t* src_a,
+                            const uint8_t* src_b,
+                            int count) {
+  uint32_t diff = 0u;
+
   int i;
   for (i = 0; i < count; ++i) {
+    int x = src_a[i] ^ src_b[i];
+    if (x & 1)
+      ++diff;
+    if (x & 2)
+      ++diff;
+    if (x & 4)
+      ++diff;
+    if (x & 8)
+      ++diff;
+    if (x & 16)
+      ++diff;
+    if (x & 32)
+      ++diff;
+    if (x & 64)
+      ++diff;
+    if (x & 128)
+      ++diff;
+  }
+  return diff;
+}
+#endif
+
+// Hakmem method for Hamming distance.
+uint32_t HammingDistance_C(const uint8_t* src_a,
+                           const uint8_t* src_b,
+                           int count) {
+  uint32_t diff = 0u;
+
+  int i;
+  for (i = 0; i < count - 3; i += 4) {
+    uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b);
+    uint32_t u = x - ((x >> 1) & 0x55555555);
+    u = ((u >> 2) & 0x33333333) + (u & 0x33333333);
+    diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24);
+    src_a += 4;
+    src_b += 4;
+  }
+
+  for (; i < count; ++i) {
+    uint32_t x = *src_a ^ *src_b;
+    uint32_t u = x - ((x >> 1) & 0x55);
+    u = ((u >> 2) & 0x33) + (u & 0x33);
+    diff += (u + (u >> 4)) & 0x0f;
+    src_a += 1;
+    src_b += 1;
+  }
+
+  return diff;
+}
+
+uint32_t SumSquareError_C(const uint8_t* src_a,
+                          const uint8_t* src_b,
+                          int count) {
+  uint32_t sse = 0u;
+  int i;
+  for (i = 0; i < count; ++i) {
     int diff = src_a[i] - src_b[i];
-    sse += (uint32)(diff * diff);
+    sse += (uint32_t)(diff * diff);
   }
   return sse;
 }
@@ -29,8 +89,8 @@
 
 // hash seed of 5381 recommended.
 // Internal C version of HashDjb2 with int sized count for efficiency.
-uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
-  uint32 hash = seed;
+uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) {
+  uint32_t hash = seed;
   int i;
   for (i = 0; i < count; ++i) {
     hash += (hash << 5) + src[i];
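
The recurrence above is the classic djb2 hash: (hash << 5) + hash is hash * 33, so each byte folds in as hash = hash * 33 + byte, which is also why the SIMD versions later in the patch scale the running hash by 33 ^ 16 (kHash16x33) per 16 input bytes. An equivalent scalar sketch (names are illustrative):

#include <cstdint>
#include <cstdio>

static uint32_t HashDjb2Sketch(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;  // seed of 5381 recommended, as noted above
  for (int i = 0; i < count; ++i) {
    hash = hash * 33 + src[i];  // same value as hash += (hash << 5) + src[i]
  }
  return hash;
}

int main() {
  const uint8_t data[] = {'y', 'u', 'v'};
  printf("0x%08x\n", HashDjb2Sketch(data, 3, 5381));
  return 0;
}
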
--- a/third_party/libyuv/source/compare_gcc.cc
+++ b/third_party/libyuv/source/compare_gcc.cc
@@ -22,124 +22,334 @@
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
-  uint32 sse;
-  asm volatile (
-    "pxor      %%xmm0,%%xmm0                   \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10, 1) ",%1          \n"
-    "movdqa    %%xmm1,%%xmm3                   \n"
-    "psubusb   %%xmm2,%%xmm1                   \n"
-    "psubusb   %%xmm3,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpckhbw %%xmm5,%%xmm2                   \n"
-    "pmaddwd   %%xmm1,%%xmm1                   \n"
-    "pmaddwd   %%xmm2,%%xmm2                   \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
+#if defined(__x86_64__)
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint64_t diff = 0u;
 
-    "pshufd    $0xee,%%xmm0,%%xmm1             \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "pshufd    $0x1,%%xmm0,%%xmm1              \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "movd      %%xmm0,%3                       \n"
+  asm volatile(
+      "xor        %3,%3                          \n"
+      "xor        %%r8,%%r8                      \n"
+      "xor        %%r9,%%r9                      \n"
+      "xor        %%r10,%%r10                    \n"
 
-  : "+r"(src_a),      // %0
-    "+r"(src_b),      // %1
-    "+r"(count),      // %2
-    "=g"(sse)         // %3
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      // Process 32 bytes per loop.
+      LABELALIGN
+      "1:                                        \n"
+      "mov        (%0),%%rcx                     \n"
+      "mov        0x8(%0),%%rdx                  \n"
+      "xor        (%1),%%rcx                     \n"
+      "xor        0x8(%1),%%rdx                  \n"
+      "popcnt     %%rcx,%%rcx                    \n"
+      "popcnt     %%rdx,%%rdx                    \n"
+      "mov        0x10(%0),%%rsi                 \n"
+      "mov        0x18(%0),%%rdi                 \n"
+      "xor        0x10(%1),%%rsi                 \n"
+      "xor        0x18(%1),%%rdi                 \n"
+      "popcnt     %%rsi,%%rsi                    \n"
+      "popcnt     %%rdi,%%rdi                    \n"
+      "add        $0x20,%0                       \n"
+      "add        $0x20,%1                       \n"
+      "add        %%rcx,%3                       \n"
+      "add        %%rdx,%%r8                     \n"
+      "add        %%rsi,%%r9                     \n"
+      "add        %%rdi,%%r10                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+
+      "add        %%r8, %3                       \n"
+      "add        %%r9, %3                       \n"
+      "add        %%r10, %3                      \n"
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "=r"(diff)    // %3
+      :
+      : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10");
+
+  return static_cast<uint32_t>(diff);
+}
+#else
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint32_t diff = 0u;
+
+  asm volatile(
+      // Process 16 bytes per loop.
+      LABELALIGN
+      "1:                                        \n"
+      "mov        (%0),%%ecx                     \n"
+      "mov        0x4(%0),%%edx                  \n"
+      "xor        (%1),%%ecx                     \n"
+      "xor        0x4(%1),%%edx                  \n"
+      "popcnt     %%ecx,%%ecx                    \n"
+      "add        %%ecx,%3                       \n"
+      "popcnt     %%edx,%%edx                    \n"
+      "add        %%edx,%3                       \n"
+      "mov        0x8(%0),%%ecx                  \n"
+      "mov        0xc(%0),%%edx                  \n"
+      "xor        0x8(%1),%%ecx                  \n"
+      "xor        0xc(%1),%%edx                  \n"
+      "popcnt     %%ecx,%%ecx                    \n"
+      "add        %%ecx,%3                       \n"
+      "popcnt     %%edx,%%edx                    \n"
+      "add        %%edx,%3                       \n"
+      "add        $0x10,%0                       \n"
+      "add        $0x10,%1                       \n"
+      "sub        $0x10,%2                       \n"
+      "jg         1b                             \n"
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "+r"(diff)    // %3
+      :
+      : "memory", "cc", "ecx", "edx");
+
+  return diff;
+}
+#endif
+
+static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15,
+                                 15, 15, 15, 15, 15, 15, 15, 15};
+static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+
+uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint32_t diff = 0u;
+
+  asm volatile(
+      "movdqa     %4,%%xmm2                      \n"
+      "movdqa     %5,%%xmm3                      \n"
+      "pxor       %%xmm0,%%xmm0                  \n"
+      "pxor       %%xmm1,%%xmm1                  \n"
+      "sub        %0,%1                          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqa     (%0),%%xmm4                    \n"
+      "movdqa     0x10(%0), %%xmm5               \n"
+      "pxor       (%0,%1), %%xmm4                \n"
+      "movdqa     %%xmm4,%%xmm6                  \n"
+      "pand       %%xmm2,%%xmm6                  \n"
+      "psrlw      $0x4,%%xmm4                    \n"
+      "movdqa     %%xmm3,%%xmm7                  \n"
+      "pshufb     %%xmm6,%%xmm7                  \n"
+      "pand       %%xmm2,%%xmm4                  \n"
+      "movdqa     %%xmm3,%%xmm6                  \n"
+      "pshufb     %%xmm4,%%xmm6                  \n"
+      "paddb      %%xmm7,%%xmm6                  \n"
+      "pxor       0x10(%0,%1),%%xmm5             \n"
+      "add        $0x20,%0                       \n"
+      "movdqa     %%xmm5,%%xmm4                  \n"
+      "pand       %%xmm2,%%xmm5                  \n"
+      "psrlw      $0x4,%%xmm4                    \n"
+      "movdqa     %%xmm3,%%xmm7                  \n"
+      "pshufb     %%xmm5,%%xmm7                  \n"
+      "pand       %%xmm2,%%xmm4                  \n"
+      "movdqa     %%xmm3,%%xmm5                  \n"
+      "pshufb     %%xmm4,%%xmm5                  \n"
+      "paddb      %%xmm7,%%xmm5                  \n"
+      "paddb      %%xmm5,%%xmm6                  \n"
+      "psadbw     %%xmm1,%%xmm6                  \n"
+      "paddd      %%xmm6,%%xmm0                  \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+
+      "pshufd     $0xaa,%%xmm0,%%xmm1            \n"
+      "paddd      %%xmm1,%%xmm0                  \n"
+      "movd       %%xmm0, %3                     \n"
+      : "+r"(src_a),       // %0
+        "+r"(src_b),       // %1
+        "+r"(count),       // %2
+        "=r"(diff)         // %3
+      : "m"(kNibbleMask),  // %4
+        "m"(kBitCount)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+
+  return diff;
+}
+
+#ifdef HAS_HAMMINGDISTANCE_AVX2
+uint32_t HammingDistance_AVX2(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count) {
+  uint32_t diff = 0u;
+
+  asm volatile(
+      "vbroadcastf128 %4,%%ymm2                  \n"
+      "vbroadcastf128 %5,%%ymm3                  \n"
+      "vpxor      %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpxor      %%ymm1,%%ymm1,%%ymm1           \n"
+      "sub        %0,%1                          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqa    (%0),%%ymm4                    \n"
+      "vmovdqa    0x20(%0), %%ymm5               \n"
+      "vpxor      (%0,%1), %%ymm4, %%ymm4        \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm6           \n"
+      "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
+      "vpshufb    %%ymm6,%%ymm3,%%ymm6           \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
+      "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
+      "vpaddb     %%ymm4,%%ymm6,%%ymm6           \n"
+      "vpxor      0x20(%0,%1),%%ymm5,%%ymm4      \n"
+      "add        $0x40,%0                       \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm5           \n"
+      "vpsrlw     $0x4,%%ymm4,%%ymm4             \n"
+      "vpshufb    %%ymm5,%%ymm3,%%ymm5           \n"
+      "vpand      %%ymm2,%%ymm4,%%ymm4           \n"
+      "vpshufb    %%ymm4,%%ymm3,%%ymm4           \n"
+      "vpaddb     %%ymm5,%%ymm4,%%ymm4           \n"
+      "vpaddb     %%ymm6,%%ymm4,%%ymm4           \n"
+      "vpsadbw    %%ymm1,%%ymm4,%%ymm4           \n"
+      "vpaddd     %%ymm0,%%ymm4,%%ymm0           \n"
+      "sub        $0x40,%2                       \n"
+      "jg         1b                             \n"
+
+      "vpermq     $0xb1,%%ymm0,%%ymm1            \n"
+      "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xaa,%%ymm0,%%ymm1            \n"
+      "vpaddd     %%ymm1,%%ymm0,%%ymm0           \n"
+      "vmovd      %%xmm0, %3                     \n"
+      "vzeroupper                                \n"
+      : "+r"(src_a),       // %0
+        "+r"(src_b),       // %1
+        "+r"(count),       // %2
+        "=r"(diff)         // %3
+      : "m"(kNibbleMask),  // %4
+        "m"(kBitCount)     // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+
+  return diff;
+}
+#endif  // HAS_HAMMINGDISTANCE_AVX2
+
+uint32_t SumSquareError_SSE2(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t sse;
+  asm volatile(
+      "pxor      %%xmm0,%%xmm0                   \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqu    (%1),%%xmm2                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "movdqa    %%xmm1,%%xmm3                   \n"
+      "psubusb   %%xmm2,%%xmm1                   \n"
+      "psubusb   %%xmm3,%%xmm2                   \n"
+      "por       %%xmm2,%%xmm1                   \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "punpckhbw %%xmm5,%%xmm2                   \n"
+      "pmaddwd   %%xmm1,%%xmm1                   \n"
+      "pmaddwd   %%xmm2,%%xmm2                   \n"
+      "paddd     %%xmm1,%%xmm0                   \n"
+      "paddd     %%xmm2,%%xmm0                   \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+
+      "pshufd    $0xee,%%xmm0,%%xmm1             \n"
+      "paddd     %%xmm1,%%xmm0                   \n"
+      "pshufd    $0x1,%%xmm0,%%xmm1              \n"
+      "paddd     %%xmm1,%%xmm0                   \n"
+      "movd      %%xmm0,%3                       \n"
+
+      : "+r"(src_a),  // %0
+        "+r"(src_b),  // %1
+        "+r"(count),  // %2
+        "=g"(sse)     // %3
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
   return sse;
 }
 
-static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
-static uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
+static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
+static const uvec32 kHashMul0 = {
+    0x0c3525e1,  // 33 ^ 15
+    0xa3476dc1,  // 33 ^ 14
+    0x3b4039a1,  // 33 ^ 13
+    0x4f5f0981,  // 33 ^ 12
 };
-static uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
+static const uvec32 kHashMul1 = {
+    0x30f35d61,  // 33 ^ 11
+    0x855cb541,  // 33 ^ 10
+    0x040a9121,  // 33 ^ 9
+    0x747c7101,  // 33 ^ 8
 };
-static uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
+static const uvec32 kHashMul2 = {
+    0xec41d4e1,  // 33 ^ 7
+    0x4cfa3cc1,  // 33 ^ 6
+    0x025528a1,  // 33 ^ 5
+    0x00121881,  // 33 ^ 4
 };
-static uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
+static const uvec32 kHashMul3 = {
+    0x00008c61,  // 33 ^ 3
+    0x00000441,  // 33 ^ 2
+    0x00000021,  // 33 ^ 1
+    0x00000001,  // 33 ^ 0
 };
 
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
-  uint32 hash;
-  asm volatile (
-    "movd      %2,%%xmm0                       \n"
-    "pxor      %%xmm7,%%xmm7                   \n"
-    "movdqa    %4,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10, 0) ",%0          \n"
-    "pmulld    %%xmm6,%%xmm0                   \n"
-    "movdqa    %5,%%xmm5                       \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm7,%%xmm3                   \n"
-    "pmulld    %%xmm5,%%xmm3                   \n"
-    "movdqa    %6,%%xmm5                       \n"
-    "movdqa    %%xmm2,%%xmm4                   \n"
-    "punpckhwd %%xmm7,%%xmm4                   \n"
-    "pmulld    %%xmm5,%%xmm4                   \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "punpckhbw %%xmm7,%%xmm1                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklwd %%xmm7,%%xmm2                   \n"
-    "pmulld    %%xmm5,%%xmm2                   \n"
-    "movdqa    %8,%%xmm5                       \n"
-    "punpckhwd %%xmm7,%%xmm1                   \n"
-    "pmulld    %%xmm5,%%xmm1                   \n"
-    "paddd     %%xmm4,%%xmm3                   \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm1                   \n"
-    "pshufd    $0xe,%%xmm1,%%xmm2              \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "pshufd    $0x1,%%xmm1,%%xmm2              \n"
-    "paddd     %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm1,%%xmm0                   \n"
-    "sub       $0x10,%1                        \n"
-    "jg        1b                              \n"
-    "movd      %%xmm0,%3                       \n"
-  : "+r"(src),        // %0
-    "+r"(count),      // %1
-    "+rm"(seed),      // %2
-    "=g"(hash)        // %3
-  : "m"(kHash16x33),  // %4
-    "m"(kHashMul0),   // %5
-    "m"(kHashMul1),   // %6
-    "m"(kHashMul2),   // %7
-    "m"(kHashMul3)    // %8
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
+  uint32_t hash;
+  asm volatile(
+      "movd      %2,%%xmm0                       \n"
+      "pxor      %%xmm7,%%xmm7                   \n"
+      "movdqa    %4,%%xmm6                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "pmulld    %%xmm6,%%xmm0                   \n"
+      "movdqa    %5,%%xmm5                       \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "punpcklbw %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm2,%%xmm3                   \n"
+      "punpcklwd %%xmm7,%%xmm3                   \n"
+      "pmulld    %%xmm5,%%xmm3                   \n"
+      "movdqa    %6,%%xmm5                       \n"
+      "movdqa    %%xmm2,%%xmm4                   \n"
+      "punpckhwd %%xmm7,%%xmm4                   \n"
+      "pmulld    %%xmm5,%%xmm4                   \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "punpckhbw %%xmm7,%%xmm1                   \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "punpcklwd %%xmm7,%%xmm2                   \n"
+      "pmulld    %%xmm5,%%xmm2                   \n"
+      "movdqa    %8,%%xmm5                       \n"
+      "punpckhwd %%xmm7,%%xmm1                   \n"
+      "pmulld    %%xmm5,%%xmm1                   \n"
+      "paddd     %%xmm4,%%xmm3                   \n"
+      "paddd     %%xmm2,%%xmm1                   \n"
+      "paddd     %%xmm3,%%xmm1                   \n"
+      "pshufd    $0xe,%%xmm1,%%xmm2              \n"
+      "paddd     %%xmm2,%%xmm1                   \n"
+      "pshufd    $0x1,%%xmm1,%%xmm2              \n"
+      "paddd     %%xmm2,%%xmm1                   \n"
+      "paddd     %%xmm1,%%xmm0                   \n"
+      "sub       $0x10,%1                        \n"
+      "jg        1b                              \n"
+      "movd      %%xmm0,%3                       \n"
+      : "+r"(src),        // %0
+        "+r"(count),      // %1
+        "+rm"(seed),      // %2
+        "=g"(hash)        // %3
+      : "m"(kHash16x33),  // %4
+        "m"(kHashMul0),   // %5
+        "m"(kHashMul1),   // %6
+        "m"(kHashMul2),   // %7
+        "m"(kHashMul3)    // %8
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
   return hash;
 }
 #endif  // defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))
@@ -148,4 +358,3 @@
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
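
For the SSSE3/AVX2 Hamming paths above: pshufb against kBitCount implements a 16-entry popcount table, applied once to each byte's low nibble and once to its high nibble (kNibbleMask isolates the nibbles). The same idea in scalar form, as a sketch rather than the shipped code:

#include <cstdint>
#include <cstdio>

static const uint8_t kBitCount[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                      1, 2, 2, 3, 2, 3, 3, 4};

static uint32_t HammingNibbleLUT(const uint8_t* a, const uint8_t* b, int n) {
  uint32_t diff = 0;
  for (int i = 0; i < n; ++i) {
    uint8_t x = a[i] ^ b[i];                        // differing bits
    diff += kBitCount[x & 15] + kBitCount[x >> 4];  // low + high nibble
  }
  return diff;
}

int main() {
  const uint8_t a[] = {0xde, 0xad, 0xbe, 0xef};
  const uint8_t b[] = {0, 0, 0, 0};
  printf("%u\n", HammingNibbleLUT(a, b, 4));  // 24 differing bits
  return 0;
}
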
--- /dev/null
+++ b/third_party/libyuv/source/compare_msa.cc
@@ -1,0 +1,97 @@
+/*
+ *  Copyright 2017 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#include "libyuv/compare_row.h"
+#include "libyuv/row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32_t HammingDistance_MSA(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t diff = 0u;
+  int i;
+  v16u8 src0, src1, src2, src3;
+  v2i64 vec0 = {0}, vec1 = {0};
+
+  for (i = 0; i < count; i += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
+    src0 ^= src2;
+    src1 ^= src3;
+    vec0 += __msa_pcnt_d((v2i64)src0);
+    vec1 += __msa_pcnt_d((v2i64)src1);
+    src_a += 32;
+    src_b += 32;
+  }
+
+  vec0 += vec1;
+  diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0);
+  diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2);
+  return diff;
+}
+
+uint32_t SumSquareError_MSA(const uint8_t* src_a,
+                            const uint8_t* src_b,
+                            int count) {
+  uint32_t sse = 0u;
+  int i;
+  v16u8 src0, src1, src2, src3;
+  v8i16 vec0, vec1, vec2, vec3;
+  v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0};
+  v2i64 tmp0;
+
+  for (i = 0; i < count; i += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16);
+    vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
+    reg0 = __msa_dpadd_s_w(reg0, vec0, vec0);
+    reg1 = __msa_dpadd_s_w(reg1, vec1, vec1);
+    reg2 = __msa_dpadd_s_w(reg2, vec2, vec2);
+    reg3 = __msa_dpadd_s_w(reg3, vec3, vec3);
+    src_a += 32;
+    src_b += 32;
+  }
+
+  reg0 += reg1;
+  reg2 += reg3;
+  reg0 += reg2;
+  tmp0 = __msa_hadd_s_d(reg0, reg0);
+  sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0);
+  sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2);
+  return sse;
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
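
These MSA kernels are wired in through the same runtime-dispatch pattern seen in compare.cc earlier in the patch: start from the portable C function pointer and upgrade it per CPU feature flag. A generic sketch, where the bool parameter stands in for TestCpuFlag(kCpuHasMSA) and the names are illustrative:

#include <cstdint>

typedef uint32_t (*RowFn)(const uint8_t* a, const uint8_t* b, int count);

static uint32_t SumSquareErrorC(const uint8_t* a, const uint8_t* b, int n) {
  uint32_t sse = 0;
  for (int i = 0; i < n; ++i) {
    int d = a[i] - b[i];
    sse += (uint32_t)(d * d);
  }
  return sse;
}

// Pick the best kernel once, then run it over the data.
static uint32_t Dispatch(const uint8_t* a, const uint8_t* b, int n,
                         bool has_msa, RowFn msa_kernel) {
  RowFn fn = SumSquareErrorC;  // portable fallback, always available
  if (has_msa && msa_kernel != nullptr) {
    fn = msa_kernel;  // e.g. SumSquareError_MSA above
  }
  return fn(a, b, n);
}

int main() {
  const uint8_t a[4] = {1, 2, 3, 4};
  const uint8_t b[4] = {1, 2, 2, 6};
  return (int)Dispatch(a, b, 4, /*has_msa=*/false, nullptr);  // 0+0+1+4 = 5
}
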
--- a/third_party/libyuv/source/compare_neon.cc
+++ b/third_party/libyuv/source/compare_neon.cc
@@ -21,40 +21,70 @@
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
     !defined(__aarch64__)
 
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
-  volatile uint32 sse;
-  asm volatile (
-    "vmov.u8    q8, #0                         \n"
-    "vmov.u8    q10, #0                        \n"
-    "vmov.u8    q9, #0                         \n"
-    "vmov.u8    q11, #0                        \n"
+// 256 bits at a time.
+// Uses a short accumulator, which restricts count to 131 KB.
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count) {
+  uint32_t diff;
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"
-    "subs       %2, %2, #16                    \n"
-    "vsubl.u8   q2, d0, d2                     \n"
-    "vsubl.u8   q3, d1, d3                     \n"
-    "vmlal.s16  q8, d4, d4                     \n"
-    "vmlal.s16  q9, d6, d6                     \n"
-    "vmlal.s16  q10, d5, d5                    \n"
-    "vmlal.s16  q11, d7, d7                    \n"
-    "bgt        1b                             \n"
+  asm volatile(
+      "vmov.u16   q4, #0                         \n"  // accumulator
 
-    "vadd.u32   q8, q8, q9                     \n"
-    "vadd.u32   q10, q10, q11                  \n"
-    "vadd.u32   q11, q8, q10                   \n"
-    "vpaddl.u32 q1, q11                        \n"
-    "vadd.u64   d0, d2, d3                     \n"
-    "vmov.32    %3, d0[0]                      \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
-    :
-    : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+      "1:                                        \n"
+      "vld1.8     {q0, q1}, [%0]!                \n"
+      "vld1.8     {q2, q3}, [%1]!                \n"
+      "veor.32    q0, q0, q2                     \n"
+      "veor.32    q1, q1, q3                     \n"
+      "vcnt.i8    q0, q0                         \n"
+      "vcnt.i8    q1, q1                         \n"
+      "subs       %2, %2, #32                    \n"
+      "vadd.u8    q0, q0, q1                     \n"  // 16 byte counts
+      "vpadal.u8  q4, q0                         \n"  // 8 shorts
+      "bgt        1b                             \n"
+
+      "vpaddl.u16 q0, q4                         \n"  // 4 ints
+      "vpadd.u32  d0, d0, d1                     \n"
+      "vpadd.u32  d0, d0, d0                     \n"
+      "vmov.32    %3, d0[0]                      \n"
+
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+      :
+      : "cc", "q0", "q1", "q2", "q3", "q4");
+  return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t sse;
+  asm volatile(
+      "vmov.u8    q8, #0                         \n"
+      "vmov.u8    q10, #0                        \n"
+      "vmov.u8    q9, #0                         \n"
+      "vmov.u8    q11, #0                        \n"
+
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"
+      "vld1.8     {q1}, [%1]!                    \n"
+      "subs       %2, %2, #16                    \n"
+      "vsubl.u8   q2, d0, d2                     \n"
+      "vsubl.u8   q3, d1, d3                     \n"
+      "vmlal.s16  q8, d4, d4                     \n"
+      "vmlal.s16  q9, d6, d6                     \n"
+      "vmlal.s16  q10, d5, d5                    \n"
+      "vmlal.s16  q11, d7, d7                    \n"
+      "bgt        1b                             \n"
+
+      "vadd.u32   q8, q8, q9                     \n"
+      "vadd.u32   q10, q10, q11                  \n"
+      "vadd.u32   q11, q8, q10                   \n"
+      "vpaddl.u32 q1, q11                        \n"
+      "vadd.u64   d0, d2, d3                     \n"
+      "vmov.32    %3, d0[0]                      \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
   return sse;
 }
 
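
Both NEON loops above sit behind the public ComputeHammingDistance() entry point added in compare.cc earlier in this patch. A usage sketch, assuming the declaration lives in libyuv/compare.h as in upstream libyuv:

#include <cstdint>
#include <cstdio>

#include "libyuv/compare.h"  // assumed location of the LIBYUV_API declaration

int main() {
  uint8_t a[64] = {0};
  uint8_t b[64] = {0};
  b[0] = 0x0f;  // four differing bits
  uint64_t bits = libyuv::ComputeHammingDistance(a, b, 64);
  printf("%llu\n", (unsigned long long)bits);  // prints 4
  return 0;
}
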
--- a/third_party/libyuv/source/compare_neon64.cc
+++ b/third_party/libyuv/source/compare_neon64.cc
@@ -20,39 +20,65 @@
 
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
-uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
-  volatile uint32 sse;
-  asm volatile (
-    "eor        v16.16b, v16.16b, v16.16b      \n"
-    "eor        v18.16b, v18.16b, v18.16b      \n"
-    "eor        v17.16b, v17.16b, v17.16b      \n"
-    "eor        v19.16b, v19.16b, v19.16b      \n"
+// 256 bits at a time.
+// Uses a short accumulator, which restricts count to 131 KB.
+uint32_t HammingDistance_NEON(const uint8_t* src_a,
+                              const uint8_t* src_b,
+                              int count) {
+  uint32_t diff;
+  asm volatile(
+      "movi       v4.8h, #0                      \n"
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"
-    "subs       %w2, %w2, #16                  \n"
-    "usubl      v2.8h, v0.8b, v1.8b            \n"
-    "usubl2     v3.8h, v0.16b, v1.16b          \n"
-    "smlal      v16.4s, v2.4h, v2.4h           \n"
-    "smlal      v17.4s, v3.4h, v3.4h           \n"
-    "smlal2     v18.4s, v2.8h, v2.8h           \n"
-    "smlal2     v19.4s, v3.8h, v3.8h           \n"
-    "b.gt       1b                             \n"
+      "1:                                        \n"
+      "ld1        {v0.16b, v1.16b}, [%0], #32    \n"
+      "ld1        {v2.16b, v3.16b}, [%1], #32    \n"
+      "eor        v0.16b, v0.16b, v2.16b         \n"
+      "eor        v1.16b, v1.16b, v3.16b         \n"
+      "cnt        v0.16b, v0.16b                 \n"
+      "cnt        v1.16b, v1.16b                 \n"
+      "subs       %w2, %w2, #32                  \n"
+      "add        v0.16b, v0.16b, v1.16b         \n"
+      "uadalp     v4.8h, v0.16b                  \n"
+      "b.gt       1b                             \n"
 
-    "add        v16.4s, v16.4s, v17.4s         \n"
-    "add        v18.4s, v18.4s, v19.4s         \n"
-    "add        v19.4s, v16.4s, v18.4s         \n"
-    "addv       s0, v19.4s                     \n"
-    "fmov       %w3, s0                        \n"
-    : "+r"(src_a),
-      "+r"(src_b),
-      "+r"(count),
-      "=r"(sse)
-    :
-    : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
+      "uaddlv     s4, v4.8h                      \n"
+      "fmov       %w3, s4                        \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff)
+      :
+      : "cc", "v0", "v1", "v2", "v3", "v4");
+  return diff;
+}
+
+uint32_t SumSquareError_NEON(const uint8_t* src_a,
+                             const uint8_t* src_b,
+                             int count) {
+  uint32_t sse;
+  asm volatile(
+      "eor        v16.16b, v16.16b, v16.16b      \n"
+      "eor        v18.16b, v18.16b, v18.16b      \n"
+      "eor        v17.16b, v17.16b, v17.16b      \n"
+      "eor        v19.16b, v19.16b, v19.16b      \n"
+
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"
+      "ld1        {v1.16b}, [%1], #16            \n"
+      "subs       %w2, %w2, #16                  \n"
+      "usubl      v2.8h, v0.8b, v1.8b            \n"
+      "usubl2     v3.8h, v0.16b, v1.16b          \n"
+      "smlal      v16.4s, v2.4h, v2.4h           \n"
+      "smlal      v17.4s, v3.4h, v3.4h           \n"
+      "smlal2     v18.4s, v2.8h, v2.8h           \n"
+      "smlal2     v19.4s, v3.8h, v3.8h           \n"
+      "b.gt       1b                             \n"
+
+      "add        v16.4s, v16.4s, v17.4s         \n"
+      "add        v18.4s, v18.4s, v19.4s         \n"
+      "add        v19.4s, v16.4s, v18.4s         \n"
+      "addv       s0, v19.4s                     \n"
+      "fmov       %w3, s0                        \n"
+      : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse)
+      :
+      : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
   return sse;
 }
 
--- a/third_party/libyuv/source/compare_win.cc
+++ b/third_party/libyuv/source/compare_win.cc
@@ -13,6 +13,10 @@
 #include "libyuv/compare_row.h"
 #include "libyuv/row.h"
 
+#if defined(_MSC_VER)
+#include <intrin.h>  // For __popcnt
+#endif
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -19,14 +23,29 @@
 #endif
 
 // This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
-__declspec(naked)
-uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+uint32_t HammingDistance_SSE42(const uint8_t* src_a,
+                               const uint8_t* src_b,
+                               int count) {
+  uint32_t diff = 0u;
+
+  int i;
+  for (i = 0; i < count - 3; i += 4) {
+    uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b);  // NOLINT
+    src_a += 4;
+    src_b += 4;
+    diff += __popcnt(x);
+  }
+  return diff;
+}
+
+__declspec(naked) uint32_t
+    SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) {
   __asm {
-    mov        eax, [esp + 4]    // src_a
-    mov        edx, [esp + 8]    // src_b
-    mov        ecx, [esp + 12]   // count
+    mov        eax, [esp + 4]  // src_a
+    mov        edx, [esp + 8]  // src_b
+    mov        ecx, [esp + 12]  // count
     pxor       xmm0, xmm0
     pxor       xmm5, xmm5
 
@@ -61,13 +80,13 @@
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
-#pragma warning(disable: 4752)
-__declspec(naked)
-uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+#pragma warning(disable : 4752)
+__declspec(naked) uint32_t
+    SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) {
   __asm {
-    mov        eax, [esp + 4]    // src_a
-    mov        edx, [esp + 8]    // src_b
-    mov        ecx, [esp + 12]   // count
+    mov        eax, [esp + 4]  // src_a
+    mov        edx, [esp + 8]  // src_b
+    mov        ecx, [esp + 12]  // count
     vpxor      ymm0, ymm0, ymm0  // sum
     vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
     sub        edx, eax
@@ -101,65 +120,65 @@
 }
 #endif  // _MSC_VER >= 1700
 
-uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
+uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0};  // 33 ^ 16
 uvec32 kHashMul0 = {
-  0x0c3525e1,  // 33 ^ 15
-  0xa3476dc1,  // 33 ^ 14
-  0x3b4039a1,  // 33 ^ 13
-  0x4f5f0981,  // 33 ^ 12
+    0x0c3525e1,  // 33 ^ 15
+    0xa3476dc1,  // 33 ^ 14
+    0x3b4039a1,  // 33 ^ 13
+    0x4f5f0981,  // 33 ^ 12
 };
 uvec32 kHashMul1 = {
-  0x30f35d61,  // 33 ^ 11
-  0x855cb541,  // 33 ^ 10
-  0x040a9121,  // 33 ^ 9
-  0x747c7101,  // 33 ^ 8
+    0x30f35d61,  // 33 ^ 11
+    0x855cb541,  // 33 ^ 10
+    0x040a9121,  // 33 ^ 9
+    0x747c7101,  // 33 ^ 8
 };
 uvec32 kHashMul2 = {
-  0xec41d4e1,  // 33 ^ 7
-  0x4cfa3cc1,  // 33 ^ 6
-  0x025528a1,  // 33 ^ 5
-  0x00121881,  // 33 ^ 4
+    0xec41d4e1,  // 33 ^ 7
+    0x4cfa3cc1,  // 33 ^ 6
+    0x025528a1,  // 33 ^ 5
+    0x00121881,  // 33 ^ 4
 };
 uvec32 kHashMul3 = {
-  0x00008c61,  // 33 ^ 3
-  0x00000441,  // 33 ^ 2
-  0x00000021,  // 33 ^ 1
-  0x00000001,  // 33 ^ 0
+    0x00008c61,  // 33 ^ 3
+    0x00000441,  // 33 ^ 2
+    0x00000021,  // 33 ^ 1
+    0x00000001,  // 33 ^ 0
 };
 
-__declspec(naked)
-uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32_t
+    HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
   __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
+    mov        eax, [esp + 4]  // src
+    mov        ecx, [esp + 8]  // count
     movd       xmm0, [esp + 12]  // seed
 
-    pxor       xmm7, xmm7        // constant 0 for unpck
+    pxor       xmm7, xmm7  // constant 0 for unpck
     movdqa     xmm6, xmmword ptr kHash16x33
 
   wloop:
-    movdqu     xmm1, [eax]       // src[0-15]
+    movdqu     xmm1, [eax]  // src[0-15]
     lea        eax, [eax + 16]
-    pmulld     xmm0, xmm6        // hash *= 33 ^ 16
+    pmulld     xmm0, xmm6  // hash *= 33 ^ 16
     movdqa     xmm5, xmmword ptr kHashMul0
     movdqa     xmm2, xmm1
-    punpcklbw  xmm2, xmm7        // src[0-7]
+    punpcklbw  xmm2, xmm7  // src[0-7]
     movdqa     xmm3, xmm2
-    punpcklwd  xmm3, xmm7        // src[0-3]
+    punpcklwd  xmm3, xmm7  // src[0-3]
     pmulld     xmm3, xmm5
     movdqa     xmm5, xmmword ptr kHashMul1
     movdqa     xmm4, xmm2
-    punpckhwd  xmm4, xmm7        // src[4-7]
+    punpckhwd  xmm4, xmm7  // src[4-7]
     pmulld     xmm4, xmm5
     movdqa     xmm5, xmmword ptr kHashMul2
-    punpckhbw  xmm1, xmm7        // src[8-15]
+    punpckhbw  xmm1, xmm7  // src[8-15]
     movdqa     xmm2, xmm1
-    punpcklwd  xmm2, xmm7        // src[8-11]
+    punpcklwd  xmm2, xmm7  // src[8-11]
     pmulld     xmm2, xmm5
     movdqa     xmm5, xmmword ptr kHashMul3
-    punpckhwd  xmm1, xmm7        // src[12-15]
+    punpckhwd  xmm1, xmm7  // src[12-15]
     pmulld     xmm1, xmm5
-    paddd      xmm3, xmm4        // add 16 results
+    paddd      xmm3, xmm4  // add 16 results
     paddd      xmm1, xmm2
     paddd      xmm1, xmm3
 
@@ -171,7 +190,7 @@
     sub        ecx, 16
     jg         wloop
 
-    movd       eax, xmm0         // return hash
+    movd       eax, xmm0  // return hash
     ret
   }
 }
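
The kHashMul tables above are simply descending powers of 33: they let the SSE4.1 kernel fold sixteen steps of the classic djb2 recurrence into one iteration, multiplying the running hash by 33^16 and adding src[i] * 33^(15-i) for each byte of the block. The scalar recurrence being vectorized looks like this (a sketch, not the shipped C path):

    #include <stdint.h>

    /* Scalar djb2 sketch: hash = hash * 33 + byte, seeded by the caller. */
    static uint32_t HashDjb2_sketch(const uint8_t* src, int count,
                                    uint32_t seed) {
      uint32_t hash = seed;
      int i;
      for (i = 0; i < count; ++i) {
        hash = hash * 33u + src[i];
      }
      return hash;
    }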
@@ -178,11 +197,11 @@
 
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
-__declspec(naked)
-uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+__declspec(naked) uint32_t
+    HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) {
   __asm {
-    mov        eax, [esp + 4]    // src
-    mov        ecx, [esp + 8]    // count
+    mov        eax, [esp + 4]  // src
+    mov        ecx, [esp + 8]  // count
     vmovd      xmm0, [esp + 12]  // seed
 
   wloop:
@@ -196,7 +215,7 @@
     vpmulld    xmm2, xmm2, xmmword ptr kHashMul2
     lea        eax, [eax + 16]
     vpmulld    xmm1, xmm1, xmmword ptr kHashMul3
-    vpaddd     xmm3, xmm3, xmm4        // add 16 results
+    vpaddd     xmm3, xmm3, xmm4  // add 16 results
     vpaddd     xmm1, xmm1, xmm2
     vpaddd     xmm1, xmm1, xmm3
     vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
@@ -207,7 +226,7 @@
     sub        ecx, 16
     jg         wloop
 
-    vmovd      eax, xmm0         // return hash
+    vmovd      eax, xmm0  // return hash
     vzeroupper
     ret
   }
--- a/third_party/libyuv/source/convert.cc
+++ b/third_party/libyuv/source/convert.cc
@@ -14,8 +14,8 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
-#include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/row.h"
+#include "libyuv/scale.h"  // For ScalePlane()
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -28,14 +28,22 @@
 }
 
 // Any I4xx To I420 format with mirroring.
-static int I4xxToI420(const uint8* src_y, int src_stride_y,
-                      const uint8* src_u, int src_stride_u,
-                      const uint8* src_v, int src_stride_v,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int src_y_width, int src_y_height,
-                      int src_uv_width, int src_uv_height) {
+static int I4xxToI420(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_y,
+                      int dst_stride_y,
+                      uint8_t* dst_u,
+                      int dst_stride_u,
+                      uint8_t* dst_v,
+                      int dst_stride_v,
+                      int src_y_width,
+                      int src_y_height,
+                      int src_uv_width,
+                      int src_uv_height) {
   const int dst_y_width = Abs(src_y_width);
   const int dst_y_height = Abs(src_y_height);
   const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
@@ -44,35 +52,37 @@
     return -1;
   }
   if (dst_y) {
-    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
-               dst_y, dst_stride_y, dst_y_width, dst_y_height,
-               kFilterBilinear);
+    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+               dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
   }
-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+             dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+             dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
   return 0;
 }
 
-// Copy I420 with optional flipping
+// Copy I420 with optional flipping.
 // TODO(fbarchard): Use ScalePlane, which supports mirroring, but ensure
 // it does row coalescing.
 LIBYUV_API
-int I420Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
+int I420Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_u || !src_v ||
-      !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -96,79 +106,152 @@
   return 0;
 }
 
+// Copy I010 with optional flipping.
+LIBYUV_API
+int I010Copy(const uint16_t* src_y,
+             int src_stride_y,
+             const uint16_t* src_u,
+             int src_stride_u,
+             const uint16_t* src_v,
+             int src_stride_v,
+             uint16_t* dst_y,
+             int dst_stride_y,
+             uint16_t* dst_u,
+             int dst_stride_u,
+             uint16_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  if (dst_y) {
+    CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  // Copy UV planes.
+  CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+  CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+  return 0;
+}
+
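Note that the pointer arithmetic above implies the 16-bit strides are counted in uint16_t elements, not bytes. A hypothetical call flipping a 640x480 I010 frame vertically via the negative-height convention (buffers assumed allocated):

    /* Hypothetical: copy a 640x480 I010 frame flipped vertically.
       Strides are in uint16_t elements; no row padding assumed. */
    I010Copy(src_y, 640, src_u, 320, src_v, 320,
             dst_y, 640, dst_u, 320, dst_v, 320,
             640, -480);  /* negative height = vertical flip */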
+// Convert 10 bit YUV to 8 bit.
+LIBYUV_API
+int I010ToI420(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  // Convert Y plane.
+  Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width,
+                    height);
+  // Convert UV planes.
+  Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth,
+                    halfheight);
+  Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth,
+                    halfheight);
+  return 0;
+}
+
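The scale value 16384 does the 10-bit to 8-bit narrowing: assuming Convert16To8Plane computes (src * scale) >> 16 per sample, 16384 = 2^14 makes that an exact right shift by two, dropping the two low bits of each 10-bit value. A worked check under that assumption:

    #include <stdint.h>

    /* 10-bit white is 1023: (1023 * 16384) >> 16 == 255, i.e. 8-bit white. */
    static uint8_t Scale10To8_sketch(uint16_t v10) {
      return (uint8_t)(((uint32_t)v10 * 16384) >> 16);  /* same as v10 >> 2 */
    }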
 // 422 chroma is 1/2 width, 1x height
 // 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API
-int I422ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I422ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   const int src_uv_width = SUBSAMPLE(width, 1, 1);
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    src_uv_width, height);
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, src_uv_width, height);
 }
 
 // 444 chroma is 1x width, 1x height
 // 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API
-int I444ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    width, height);
+int I444ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, width, height);
 }
 
-// 411 chroma is 1/4 width, 1x height
-// 420 chroma is 1/2 width, 1/2 height
-LIBYUV_API
-int I411ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int src_uv_width = SUBSAMPLE(width, 3, 2);
-  return I4xxToI420(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    src_uv_width, height);
-}
-
 // I400 is greyscale, typically used in MJPG
 LIBYUV_API
-int I400ToI420(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I400ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -186,11 +269,15 @@
   return 0;
 }
 
-static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
-                       uint8* dst, int dst_stride,
-                       int width, int height) {
+static void CopyPlane2(const uint8_t* src,
+                       int src_stride_0,
+                       int src_stride_1,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width,
+                       int height) {
   int y;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
 #if defined(HAS_COPYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -211,11 +298,6 @@
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
 
   // Copy plane
   for (y = 0; y < height - 1; y += 2) {
@@ -238,17 +320,22 @@
 // src_stride_m420 is row planar. Normally this will be the width in pixels.
 //   The UV plane is half width but carries 2 values per chroma sample pair,
 //   so src_stride_m420 applies to it as well as to the two Y planes.
-static int X420ToI420(const uint8* src_y,
-                      int src_stride_y0, int src_stride_y1,
-                      const uint8* src_uv, int src_stride_uv,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int width, int height) {
+static int X420ToI420(const uint8_t* src_y,
+                      int src_stride_y0,
+                      int src_stride_y1,
+                      const uint8_t* src_uv,
+                      int src_stride_uv,
+                      uint8_t* dst_y,
+                      int dst_stride_y,
+                      uint8_t* dst_u,
+                      int dst_stride_u,
+                      uint8_t* dst_v,
+                      int dst_stride_v,
+                      int width,
+                      int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_uv || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -265,8 +352,7 @@
     dst_stride_v = -dst_stride_v;
   }
   // Coalesce rows.
-  if (src_stride_y0 == width &&
-      src_stride_y1 == width &&
+  if (src_stride_y0 == width && src_stride_y1 == width &&
       dst_stride_y == width) {
     width *= height;
     height = 1;
@@ -273,8 +359,7 @@
     src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
   }
   // Coalesce rows.
-  if (src_stride_uv == halfwidth * 2 &&
-      dst_stride_u == halfwidth &&
+  if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth &&
       dst_stride_v == halfwidth) {
     halfwidth *= halfheight;
     halfheight = 1;
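The row-coalescing checks above exploit the fact that a plane whose stride equals its width has no inter-row padding, so the whole plane is one contiguous run and can be processed as a single row of width * height pixels, amortizing the per-row call overhead. The idea in isolation (sketch):

    /* Sketch: collapse a padding-free plane into one long row. */
    if (src_stride == width && dst_stride == width) {
      width *= height;              /* one row spanning the whole plane */
      height = 1;
      src_stride = dst_stride = 0;  /* strides are moot for a single row */
    }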
@@ -299,63 +384,78 @@
 
 // Convert NV12 to I420.
 LIBYUV_API
-int NV12ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_y, src_stride_y, src_stride_y,
-                    src_uv, src_stride_uv,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height);
+int NV12ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv,
+                    dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                    dst_stride_v, width, height);
 }
 
 // Convert NV21 to I420.  Same as NV12 but u and v pointers swapped.
 LIBYUV_API
-int NV21ToI420(const uint8* src_y, int src_stride_y,
-               const uint8* src_vu, int src_stride_vu,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  return X420ToI420(src_y, src_stride_y, src_stride_y,
-                    src_vu, src_stride_vu,
-                    dst_y, dst_stride_y,
-                    dst_v, dst_stride_v,
-                    dst_u, dst_stride_u,
-                    width, height);
+int NV21ToI420(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu,
+                    dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u,
+                    dst_stride_u, width, height);
 }
 
 // Convert M420 to I420.
 LIBYUV_API
-int M420ToI420(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int M420ToI420(const uint8_t* src_m420,
+               int src_stride_m420,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
-                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
+                    src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y,
+                    dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
                     width, height);
 }
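
The stride gymnastics in M420ToI420 follow from the M420 layout, where every two rows of Y are followed by one row of interleaved UV: Y rows alternate strides of src_stride_m420 and 2 * src_stride_m420 (hopping over each UV row), and the UV plane starts two rows in with a stride of 3 * src_stride_m420. Sketched as a memory map:

    /* M420 layout, repeating every 3 * stride bytes:
     *   row 3n+0: Y Y Y Y ...   (next Y row is 1 * stride away)
     *   row 3n+1: Y Y Y Y ...   (next Y row is 2 * stride away)
     *   row 3n+2: U V U V ...   <- UV plane, stride 3 * stride
     */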
 
 // Convert YUY2 to I420.
 LIBYUV_API
-int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int YUY2ToI420(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
-      uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2,
-      uint8* dst_y, int width) = YUY2ToYRow_C;
+  void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      YUY2ToUVRow_C;
+  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+      YUY2ToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -392,6 +492,16 @@
     }
   }
 #endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+      YUY2ToUVRow = YUY2ToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
@@ -411,16 +521,22 @@
 
 // Convert UYVY to I420.
 LIBYUV_API
-int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int UYVYToI420(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
-      uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C;
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-      uint8* dst_y, int width) = UYVYToYRow_C;
+  void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      UYVYToUVRow_C;
+  void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+      UYVYToYRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -457,6 +573,16 @@
     }
   }
 #endif
+#if defined(HAS_UYVYTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUVRow = UYVYToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUVRow = UYVYToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
@@ -476,19 +602,23 @@
 
 // Convert ARGB to I420.
 LIBYUV_API
-int ARGBToI420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -533,6 +663,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
@@ -552,19 +698,23 @@
 
 // Convert BGRA to I420.
 LIBYUV_API
-int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int BGRAToI420(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
-      uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
-  void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) =
+  void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      BGRAToUVRow_C;
+  void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) =
       BGRAToYRow_C;
-  if (!src_bgra ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -592,13 +742,29 @@
   }
 #endif
 #if defined(HAS_BGRATOUVROW_NEON)
-    if (TestCpuFlag(kCpuHasNEON)) {
-      BGRAToUVRow = BGRAToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        BGRAToUVRow = BGRAToUVRow_NEON;
-      }
+  if (TestCpuFlag(kCpuHasNEON)) {
+    BGRAToUVRow = BGRAToUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_NEON;
     }
+  }
 #endif
+#if defined(HAS_BGRATOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    BGRAToYRow = BGRAToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToYRow = BGRAToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_BGRATOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    BGRAToUVRow = BGRAToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      BGRAToUVRow = BGRAToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
@@ -618,19 +784,23 @@
 
 // Convert ABGR to I420.
 LIBYUV_API
-int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ABGRToI420(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
-      uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
-  void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) =
+  void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ABGRToUVRow_C;
+  void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) =
       ABGRToYRow_C;
-  if (!src_abgr ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -665,6 +835,22 @@
     }
   }
 #endif
+#if defined(HAS_ABGRTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToYRow = ABGRToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToYRow = ABGRToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ABGRToUVRow = ABGRToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ABGRToUVRow = ABGRToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
@@ -684,19 +870,23 @@
 
 // Convert RGBA to I420.
 LIBYUV_API
-int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int RGBAToI420(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
-      uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
-  void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) =
+  void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGBAToUVRow_C;
+  void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) =
       RGBAToYRow_C;
-  if (!src_rgba ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -731,6 +921,22 @@
     }
   }
 #endif
+#if defined(HAS_RGBATOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGBAToYRow = RGBAToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToYRow = RGBAToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_RGBATOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGBAToUVRow = RGBAToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGBAToUVRow = RGBAToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
@@ -750,27 +956,33 @@
 
 // Convert RGB24 to I420.
 LIBYUV_API
-int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
-                uint8* dst_y, int dst_stride_y,
-                uint8* dst_u, int dst_stride_u,
-                uint8* dst_v, int dst_stride_v,
-                int width, int height) {
+int RGB24ToI420(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_y,
+                int dst_stride_y,
+                uint8_t* dst_u,
+                int dst_stride_u,
+                uint8_t* dst_v,
+                int dst_stride_v,
+                int width,
+                int height) {
   int y;
-#if defined(HAS_RGB24TOYROW_NEON)
-  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
-      uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
-  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) =
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
+  void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGB24ToUVRow_C;
+  void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
       RGB24ToYRow_C;
 #else
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RGB24ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -792,6 +1004,15 @@
       }
     }
   }
+#elif defined(HAS_RGB24TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
+    RGB24ToYRow = RGB24ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToYRow = RGB24ToYRow_MSA;
+      RGB24ToUVRow = RGB24ToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from RGB24 to ARGB.
 #else
 #if defined(HAS_RGB24TOARGBROW_SSSE3)
@@ -822,7 +1043,10 @@
     }
   }
 #endif
+#endif
+
   {
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
@@ -829,7 +1053,7 @@
 #endif
 
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB24TOYROW_NEON)
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
       RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
       RGB24ToYRow(src_rgb24, dst_y, width);
       RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
@@ -846,7 +1070,7 @@
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_RGB24TOYROW_NEON)
+#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
       RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
       RGB24ToYRow(src_rgb24, dst_y, width);
 #else
@@ -855,36 +1079,41 @@
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_RGB24TOYROW_NEON)
+#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA))
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
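
RGB24ToI420 now has three row pipelines: native NEON, native MSA (new in this change), and the generic path that stages two rows through a scratch ARGB buffer. For reference, the generic per-pair body elided between the hunks above reduces to roughly this (names as in the source; a condensed sketch, not the verbatim loop):

    /* Generic path, per pair of rows: stage through the scratch ARGB rows. */
    RGB24ToARGBRow(src_rgb24, row, width);
    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
    ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
    ARGBToYRow(row, dst_y, width);
    ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);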
 
 // Convert RAW to I420.
 LIBYUV_API
-int RAWToI420(const uint8* src_raw, int src_stride_raw,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height) {
+int RAWToI420(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
   int y;
-#if defined(HAS_RAWTOYROW_NEON)
-  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
-      uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
-  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) =
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
+  void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
+                     uint8_t* dst_v, int width) = RAWToUVRow_C;
+  void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
       RAWToYRow_C;
 #else
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RAWToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_raw || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -906,6 +1135,15 @@
       }
     }
   }
+#elif defined(HAS_RAWTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToUVRow = RAWToUVRow_Any_MSA;
+    RAWToYRow = RAWToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToYRow = RAWToYRow_MSA;
+      RAWToUVRow = RAWToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from RAW to ARGB.
 #else
 #if defined(HAS_RAWTOARGBROW_SSSE3)
@@ -936,7 +1174,10 @@
     }
   }
 #endif
+#endif
+
   {
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
@@ -943,7 +1184,7 @@
 #endif
 
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RAWTOYROW_NEON)
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
       RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
       RAWToYRow(src_raw, dst_y, width);
       RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
@@ -960,7 +1201,7 @@
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_RAWTOYROW_NEON)
+#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
       RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
       RAWToYRow(src_raw, dst_y, width);
 #else
@@ -969,36 +1210,42 @@
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_RAWTOYROW_NEON)
+#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA))
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
 // Convert RGB565 to I420.
 LIBYUV_API
-int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
-                 uint8* dst_y, int dst_stride_y,
-                 uint8* dst_u, int dst_stride_u,
-                 uint8* dst_v, int dst_stride_v,
-                 int width, int height) {
+int RGB565ToI420(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 uint8_t* dst_u,
+                 int dst_stride_u,
+                 uint8_t* dst_v,
+                 int dst_stride_v,
+                 int width,
+                 int height) {
   int y;
-#if defined(HAS_RGB565TOYROW_NEON)
-  void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
-      uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
-  void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) =
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
+  void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565,
+                        uint8_t* dst_u, uint8_t* dst_v, int width) =
+      RGB565ToUVRow_C;
+  void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) =
       RGB565ToYRow_C;
 #else
-  void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      RGB565ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                          int width) = RGB565ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1020,6 +1267,15 @@
       }
     }
   }
+#elif defined(HAS_RGB565TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
+    RGB565ToYRow = RGB565ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToYRow = RGB565ToYRow_MSA;
+      RGB565ToUVRow = RGB565ToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from RGB565 to ARGB.
 #else
 #if defined(HAS_RGB565TOARGBROW_SSE2)
@@ -1058,14 +1314,15 @@
     }
   }
 #endif
+#endif
   {
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 #endif
-
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_RGB565TOYROW_NEON)
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
       RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
       RGB565ToYRow(src_rgb565, dst_y, width);
       RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
@@ -1082,7 +1339,7 @@
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_RGB565TOYROW_NEON)
+#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
       RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
       RGB565ToYRow(src_rgb565, dst_y, width);
 #else
@@ -1091,36 +1348,43 @@
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_RGB565TOYROW_NEON)
+#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA))
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
 // Convert ARGB1555 to I420.
 LIBYUV_API
-int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height) {
+int ARGB1555ToI420(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height) {
   int y;
-#if defined(HAS_ARGB1555TOYROW_NEON)
-  void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
-  void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) =
-      ARGB1555ToYRow_C;
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
+  void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555,
+                          uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGB1555ToUVRow_C;
+  void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y,
+                         int width) = ARGB1555ToYRow_C;
 #else
-  void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      ARGB1555ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                            int width) = ARGB1555ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1142,6 +1406,15 @@
       }
     }
   }
+#elif defined(HAS_ARGB1555TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
+    ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToYRow = ARGB1555ToYRow_MSA;
+      ARGB1555ToUVRow = ARGB1555ToUVRow_MSA;
+    }
+  }
 // Other platforms do intermediate conversion from ARGB1555 to ARGB.
 #else
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
@@ -1180,7 +1453,9 @@
     }
   }
 #endif
+#endif
   {
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
@@ -1187,7 +1462,7 @@
 #endif
 
     for (y = 0; y < height - 1; y += 2) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
       ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
       ARGB1555ToYRow(src_argb1555, dst_y, width);
       ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
@@ -1206,7 +1481,7 @@
       dst_v += dst_stride_v;
     }
     if (height & 1) {
-#if defined(HAS_ARGB1555TOYROW_NEON)
+#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
       ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
       ARGB1555ToYRow(src_argb1555, dst_y, width);
 #else
@@ -1215,36 +1490,43 @@
       ARGBToYRow(row, dst_y, width);
 #endif
     }
-#if !defined(HAS_ARGB1555TOYROW_NEON)
+#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA))
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
 // Convert ARGB4444 to I420.
 LIBYUV_API
-int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
-                   uint8* dst_y, int dst_stride_y,
-                   uint8* dst_u, int dst_stride_u,
-                   uint8* dst_v, int dst_stride_v,
-                   int width, int height) {
+int ARGB4444ToI420(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_y,
+                   int dst_stride_y,
+                   uint8_t* dst_u,
+                   int dst_stride_u,
+                   uint8_t* dst_v,
+                   int dst_stride_v,
+                   int width,
+                   int height) {
   int y;
 #if defined(HAS_ARGB4444TOYROW_NEON)
-  void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
-      uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
-  void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) =
-      ARGB4444ToYRow_C;
+  void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444,
+                          uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGB4444ToUVRow_C;
+  void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y,
+                         int width) = ARGB4444ToYRow_C;
 #else
-  void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
-      ARGB4444ToARGBRow_C;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb,
+                            int width) = ARGB4444ToARGBRow_C;
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
 #endif
-  if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1284,6 +1566,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
@@ -1304,7 +1594,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+      if (IS_ALIGNED(width, 32)) {
+        ARGBToUVRow = ARGBToUVRow_MSA;
+      }
+    }
+  }
+#endif
+#endif
+
   {
+#if !defined(HAS_ARGB4444TOYROW_NEON)
     // Allocate 2 rows of ARGB.
     const int kRowSize = (width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
@@ -1341,13 +1646,15 @@
     }
 #if !defined(HAS_ARGB4444TOYROW_NEON)
     free_aligned_buffer_64(row);
-  }
 #endif
+  }
   return 0;
 }
 
-static void SplitPixels(const uint8* src_u, int src_pixel_stride_uv,
-                        uint8* dst_u, int width) {
+static void SplitPixels(const uint8_t* src_u,
+                        int src_pixel_stride_uv,
+                        uint8_t* dst_u,
+                        int width) {
   int i;
   for (i = 0; i < width; ++i) {
     *dst_u = *src_u;
@@ -1358,21 +1665,26 @@
 
 // Convert Android420 to I420.
 LIBYUV_API
-int Android420ToI420(const uint8* src_y, int src_stride_y,
-                     const uint8* src_u, int src_stride_u,
-                     const uint8* src_v, int src_stride_v,
+int Android420ToI420(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
                      int src_pixel_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int width, int height) {
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height) {
   int y;
-  const int vu_off = src_v - src_u;
+  const ptrdiff_t vu_off = src_v - src_u;
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_u || !src_v ||
-      !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1396,15 +1708,16 @@
     CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
     CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
     return 0;
-  // Split UV planes - NV21
-  } else if (src_pixel_stride_uv == 2 && vu_off == -1 &&
-             src_stride_u == src_stride_v) {
+    // Split UV planes - NV21
+  }
+  if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+      src_stride_u == src_stride_v) {
     SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u,
                  halfwidth, halfheight);
     return 0;
-  // Split UV planes - NV12
-  } else if (src_pixel_stride_uv == 2 && vu_off == 1 &&
-             src_stride_u == src_stride_v) {
+    // Split UV planes - NV12
+  }
+  if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
     SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v,
                  halfwidth, halfheight);
     return 0;
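Android420ToI420 dispatches on the flexible-YUV chroma geometry seen above; summarizing the branches (the final pixel-by-pixel fallback presumably uses the SplitPixels helper defined earlier, past the end of this hunk):

    /* Dispatch sketch for Android flexible YUV:
     *   src_pixel_stride_uv == 1                     -> planar I420: CopyPlane x3
     *   == 2, vu_off == -1, src_stride_u == stride_v -> NV21: SplitUVPlane, U/V swapped
     *   == 2, vu_off == +1, src_stride_u == stride_v -> NV12: SplitUVPlane
     *   anything else                                -> generic: split row by row
     */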
--- a/third_party/libyuv/source/convert_argb.cc
+++ b/third_party/libyuv/source/convert_argb.cc
@@ -26,11 +26,13 @@
 
 // Copy ARGB with optional flipping
 LIBYUV_API
-int ARGBCopy(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int width, int height) {
-  if (!src_argb || !dst_argb ||
-      width <= 0 || height == 0) {
+int ARGBCopy(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int width,
+             int height) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -40,27 +42,29 @@
     src_stride_argb = -src_stride_argb;
   }
 
-  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
-            width * 4, height);
+  CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4,
+            height);
   return 0;
 }
 
-// Convert I422 to ARGB with matrix
-static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
+// Convert I420 to ARGB with matrix
+static int I420ToARGBMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -93,13 +97,12 @@
     }
   }
 #endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
   }
 #endif
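
The replacement above follows the dispatch idiom used throughout this file: start from the portable C row function, take the `_Any_` SIMD variant when the CPU flag is set, and the full-vector variant when the width divides evenly. The `_Any_` wrappers exist so odd widths still get the SIMD path; a simplified, hypothetical sketch of their shape (modeled loosely on libyuv's ANY macros, here for a 1-byte-per-pixel source producing 4-byte ARGB):

    /* Sketch only: run the vector kernel on the multiple-of-8 prefix,
     * then route the 0..7 leftover pixels through small stack buffers so
     * the same kernel can finish the tail. */
    #include <stdint.h>
    #include <string.h>
    #define SKETCH_MASK 7 /* kernel handles 8 pixels per call */
    static void RowFn_Any_Sketch(const uint8_t* src, uint8_t* dst, int width,
                                 void (*simd_row)(const uint8_t*, uint8_t*,
                                                  int)) {
      uint8_t temp_src[16];
      uint8_t temp_dst[64];
      int n = width & ~SKETCH_MASK; /* largest multiple of 8 <= width */
      int r = width & SKETCH_MASK;  /* remaining 0..7 pixels */
      if (n > 0) {
        simd_row(src, dst, n);
      }
      if (r > 0) {
        memset(temp_src, 0, sizeof(temp_src));
        memcpy(temp_src, src + n, (size_t)r);          /* pad the tail */
        simd_row(temp_src, temp_dst, SKETCH_MASK + 1); /* one full vector */
        memcpy(dst + n * 4, temp_dst, (size_t)r * 4);  /* 4 bytes/pixel */
      }
    }
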
 
@@ -117,30 +120,36 @@
 
 // Convert I420 to ARGB.
 LIBYUV_API
-int I420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
+int I420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I420 to ABGR.
 LIBYUV_API
-int I420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int I420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
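
I420ToABGR needs no ABGR-specific row code: feeding V where U is expected, together with the mirrored YVU constant set, exchanges the red and blue results, so a kernel that stores B,G,R,A effectively emits R,G,B,A. The per-pixel math shows why; the integer coefficients below are the common BT.601 approximation for illustration, not libyuv's actual tables:

    /* With the standard matrix, R depends on V and B depends on U, so
     * swapping the chroma inputs and mirroring the constants swaps the
     * R and B outputs. Clamping to [0, 255] is omitted for brevity. */
    static void yuv_to_rgb_pixel(int y, int u, int v,
                                 int* r, int* g, int* b) {
      int c = y - 16, d = u - 128, e = v - 128;
      *r = (298 * c + 409 * e + 128) >> 8;           /* R from V */
      *g = (298 * c - 100 * d - 208 * e + 128) >> 8; /* G from U and V */
      *b = (298 * c + 516 * d + 128) >> 8;           /* B from U */
    }
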
@@ -147,30 +156,36 @@
 
 // Convert J420 to ARGB.
 LIBYUV_API
-int J420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
+int J420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
 }
 
 // Convert J420 to ABGR.
 LIBYUV_API
-int J420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int J420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuJPEGConstants,  // Use Yvu matrix
                           width, height);
 }
@@ -177,51 +192,58 @@
 
 // Convert H420 to ARGB.
 LIBYUV_API
-int H420ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvH709Constants,
-                          width, height);
+int H420ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
 }
 
 // Convert H420 to ABGR.
 LIBYUV_API
-int H420ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I420ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int H420ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I420ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuH709Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert I422 to ARGB with matrix
-static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
+static int I422ToARGBMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -231,10 +253,8 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
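
Row coalescing pays off whenever every plane is stored with no padding: the whole frame can then be treated as one long row, so the SIMD row function runs once instead of once per row. A worked instance of the test above, assuming tightly packed I422 buffers:

    /* A 64x4 I422 image with strides 64, 32, 32 and 256 has no row
     * padding, so the converter hands the row kernel a single
     * 256-pixel row. */
    #include <assert.h>
    static void coalesce_example(void) {
      int width = 64, height = 4;
      int src_stride_y = 64, src_stride_u = 32, src_stride_v = 32;
      int dst_stride_argb = 64 * 4;
      if (src_stride_y == width && src_stride_u * 2 == width &&
          src_stride_v * 2 == width && dst_stride_argb == width * 4) {
        width *= height; /* one long row covering the whole image */
        height = 1;
        src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
      }
      assert(width == 256 && height == 1);
    }
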
@@ -263,13 +283,12 @@
     }
   }
 #endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
   }
 #endif
 
@@ -285,30 +304,36 @@
 
 // Convert I422 to ARGB.
 LIBYUV_API
-int I422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
+int I422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I422 to ABGR.
 LIBYUV_API
-int I422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int I422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
@@ -315,30 +340,36 @@
 
 // Convert J422 to ARGB.
 LIBYUV_API
-int J422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
+int J422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
 }
 
 // Convert J422 to ABGR.
 LIBYUV_API
-int J422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int J422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuJPEGConstants,  // Use Yvu matrix
                           width, height);
 }
@@ -345,165 +376,308 @@
 
 // Convert H422 to ARGB.
 LIBYUV_API
-int H422ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvH709Constants,
-                          width, height);
+int H422ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
 }
 
 // Convert H422 to ABGR.
 LIBYUV_API
-int H422ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I422ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int H422ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I422ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuH709Constants,  // Use Yvu matrix
                           width, height);
 }
 
-// Convert I444 to ARGB with matrix
-static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_argb, int dst_stride_argb,
+// Convert 10 bit YUV to AR30 with matrix
+// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to
+// multiply 10 bit yuv into high bits to allow any number of bits.
+static int I010ToAR30Matrix(const uint16_t* src_y,
+                            int src_stride_y,
+                            const uint16_t* src_u,
+                            int src_stride_u,
+                            const uint16_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_ar30,
+                            int dst_stride_ar30,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I444ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I444ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf,
+                        const uint16_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I210ToAR30Row_C;
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
-    dst_stride_argb = -dst_stride_argb;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
   }
-  // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u == width &&
-      src_stride_v == width &&
-      dst_stride_argb == width * 4) {
-    width *= height;
-    height = 1;
-    src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
-  }
-#if defined(HAS_I444TOARGBROW_SSSE3)
+#if defined(HAS_I210TOAR30ROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+    I210ToAR30Row = I210ToAR30Row_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I444ToARGBRow = I444ToARGBRow_SSSE3;
+      I210ToAR30Row = I210ToAR30Row_SSSE3;
     }
   }
 #endif
-#if defined(HAS_I444TOARGBROW_AVX2)
+#if defined(HAS_I210TOAR30ROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    I444ToARGBRow = I444ToARGBRow_Any_AVX2;
+    I210ToAR30Row = I210ToAR30Row_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      I444ToARGBRow = I444ToARGBRow_AVX2;
+      I210ToAR30Row = I210ToAR30Row_AVX2;
     }
   }
 #endif
-#if defined(HAS_I444TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    I444ToARGBRow = I444ToARGBRow_Any_NEON;
+  for (y = 0; y < height; ++y) {
+    I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
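
Because I010 is 4:2:0, the loop above advances src_u and src_v only after odd rows (`if (y & 1)`), so each chroma row serves two consecutive luma rows. In outline:

    /* Sketch of the vertical chroma sharing: luma rows 0 and 1 read
     * chroma row 0, luma rows 2 and 3 read chroma row 1, and so on. */
    int y, chroma_row = 0;
    for (y = 0; y < height; ++y) {
      /* convert luma row y against chroma row chroma_row */
      if (y & 1) {
        ++chroma_row; /* step chroma after every second luma row */
      }
    }
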
+
+// Convert I010 to AR30.
+LIBYUV_API
+int I010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert H010 to AR30.
+LIBYUV_API
+int H010ToAR30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvH709Constants, width, height);
+}
+
+// Convert I010 to AB30.
+LIBYUV_API
+int I010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvuI601Constants, width, height);
+}
+
+// Convert H010 to AB30.
+LIBYUV_API
+int H010ToAB30(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                          src_stride_u, dst_ab30, dst_stride_ab30,
+                          &kYvuH709Constants, width, height);
+}
+
+// Convert 10 bit YUV to ARGB with matrix
+static int I010ToARGBMatrix(const uint16_t* src_y,
+                            int src_stride_y,
+                            const uint16_t* src_u,
+                            int src_stride_u,
+                            const uint16_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height) {
+  int y;
+  void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf,
+                        const uint16_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I210ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+#if defined(HAS_I210TOARGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I444ToARGBRow = I444ToARGBRow_NEON;
+      I210ToARGBRow = I210ToARGBRow_SSSE3;
     }
   }
 #endif
-
+#if defined(HAS_I210TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I210ToARGBRow = I210ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I210ToARGBRow = I210ToARGBRow_AVX2;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
-    I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
+    I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
-    src_u += src_stride_u;
-    src_v += src_stride_v;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
   }
   return 0;
 }
 
-// Convert I444 to ARGB.
+// Convert I010 to ARGB.
 LIBYUV_API
-int I444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvI601Constants,
-                          width, height);
+int I010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
 }
 
-// Convert I444 to ABGR.
+// Convert I010 to ABGR.
 LIBYUV_API
-int I444ToABGR(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_abgr, int dst_stride_abgr,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_abgr, dst_stride_abgr,
+int I010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
-// Convert J444 to ARGB.
+// Convert H010 to ARGB.
 LIBYUV_API
-int J444ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return I444ToARGBMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_argb, dst_stride_argb,
-                          &kYuvJPEGConstants,
-                          width, height);
+int H010ToARGB(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvH709Constants, width, height);
 }
 
-// Convert I411 to ARGB.
+// Convert H010 to ABGR.
 LIBYUV_API
-int I411ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int H010ToABGR(const uint16_t* src_y,
+               int src_stride_y,
+               const uint16_t* src_u,
+               int src_stride_u,
+               const uint16_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I010ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuH709Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert I444 to ARGB with matrix
+static int I444ToARGBMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height) {
   int y;
-  void (*I411ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I411ToARGBRow_C;
-  if (!src_y || !src_u || !src_v ||
-      !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I444ToARGBRow_C;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -513,41 +687,47 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 4 == width &&
-      src_stride_v * 4 == width &&
+  if (src_stride_y == width && src_stride_u == width && src_stride_v == width &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
   }
-#if defined(HAS_I411TOARGBROW_SSSE3)
+#if defined(HAS_I444TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
+    I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_SSSE3;
+      I444ToARGBRow = I444ToARGBRow_SSSE3;
     }
   }
 #endif
-#if defined(HAS_I411TOARGBROW_AVX2)
+#if defined(HAS_I444TOARGBROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    I411ToARGBRow = I411ToARGBRow_Any_AVX2;
+    I444ToARGBRow = I444ToARGBRow_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      I411ToARGBRow = I411ToARGBRow_AVX2;
+      I444ToARGBRow = I444ToARGBRow_AVX2;
     }
   }
 #endif
-#if defined(HAS_I411TOARGBROW_NEON)
+#if defined(HAS_I444TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    I411ToARGBRow = I411ToARGBRow_Any_NEON;
+    I444ToARGBRow = I444ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      I411ToARGBRow = I411ToARGBRow_NEON;
+      I444ToARGBRow = I444ToARGBRow_NEON;
     }
   }
 #endif
+#if defined(HAS_I444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I444ToARGBRow = I444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I444ToARGBRow = I444ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
-    I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width);
+    I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     src_u += src_stride_u;
@@ -556,26 +736,83 @@
   return 0;
 }
 
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert I444 to ABGR.
+LIBYUV_API
+int I444ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_abgr, dst_stride_abgr,
+                          &kYvuI601Constants,  // Use Yvu matrix
+                          width, height);
+}
+
+// Convert J444 to ARGB.
+LIBYUV_API
+int J444ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_argb, dst_stride_argb,
+                          &kYuvJPEGConstants, width, height);
+}
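
As the wrappers above show, the leading letter of each entry point selects the color matrix passed to the shared Matrix function: I* uses kYuvI601Constants (BT.601, studio range), J* uses kYuvJPEGConstants (BT.601, full range), and H* uses kYuvH709Constants (BT.709). A caller-side selector might look like the following; the function itself is illustrative, not part of libyuv:

    /* Hypothetical helper mapping the naming convention to the constant
     * tables used throughout this file. */
    static const struct YuvConstants* PickMatrix(char prefix) {
      switch (prefix) {
        case 'J': return &kYuvJPEGConstants; /* BT.601, full range */
        case 'H': return &kYuvH709Constants; /* BT.709 */
        case 'I':
        default:  return &kYuvI601Constants; /* BT.601, studio range */
      }
    }
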
+
 // Convert I420 with Alpha to preattenuated ARGB.
-static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y,
-                                 const uint8* src_u, int src_stride_u,
-                                 const uint8* src_v, int src_stride_v,
-                                 const uint8* src_a, int src_stride_a,
-                                 uint8* dst_argb, int dst_stride_argb,
+static int I420AlphaToARGBMatrix(const uint8_t* src_y,
+                                 int src_stride_y,
+                                 const uint8_t* src_u,
+                                 int src_stride_u,
+                                 const uint8_t* src_v,
+                                 int src_stride_v,
+                                 const uint8_t* src_a,
+                                 int src_stride_a,
+                                 uint8_t* dst_argb,
+                                 int dst_stride_argb,
                                  const struct YuvConstants* yuvconstants,
-                                 int width, int height, int attenuate) {
+                                 int width,
+                                 int height,
+                                 int attenuate) {
   int y;
-  void (*I422AlphaToARGBRow)(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
+  void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                             const uint8_t* v_buf, const uint8_t* a_buf,
+                             uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) = I422AlphaToARGBRow_C;
-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+  void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                            int width) = ARGBAttenuateRow_C;
-  if (!src_y || !src_u || !src_v || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -608,13 +845,12 @@
     }
   }
 #endif
-#if defined(HAS_I422ALPHATOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2;
+#if defined(HAS_I422ALPHATOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422AlphaToARGBRow = I422AlphaToARGBRow_MSA;
+    }
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
@@ -641,6 +877,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants,
@@ -661,49 +905,59 @@
 
 // Convert I420 with Alpha to ARGB.
 LIBYUV_API
-int I420AlphaToARGB(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int attenuate) {
-  return I420AlphaToARGBMatrix(src_y, src_stride_y,
-                               src_u, src_stride_u,
-                               src_v, src_stride_v,
-                               src_a, src_stride_a,
-                               dst_argb, dst_stride_argb,
-                               &kYuvI601Constants,
-                               width, height, attenuate);
+int I420AlphaToARGB(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int attenuate) {
+  return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                               src_stride_v, src_a, src_stride_a, dst_argb,
+                               dst_stride_argb, &kYuvI601Constants, width,
+                               height, attenuate);
 }
 
 // Convert I420 with Alpha to ABGR.
 LIBYUV_API
-int I420AlphaToABGR(const uint8* src_y, int src_stride_y,
-                    const uint8* src_u, int src_stride_u,
-                    const uint8* src_v, int src_stride_v,
-                    const uint8* src_a, int src_stride_a,
-                    uint8* dst_abgr, int dst_stride_abgr,
-                    int width, int height, int attenuate) {
-  return I420AlphaToARGBMatrix(src_y, src_stride_y,
-                               src_v, src_stride_v,  // Swap U and V
-                               src_u, src_stride_u,
-                               src_a, src_stride_a,
-                               dst_abgr, dst_stride_abgr,
-                               &kYvuI601Constants,  // Use Yvu matrix
-                               width, height, attenuate);
+int I420AlphaToABGR(const uint8_t* src_y,
+                    int src_stride_y,
+                    const uint8_t* src_u,
+                    int src_stride_u,
+                    const uint8_t* src_v,
+                    int src_stride_v,
+                    const uint8_t* src_a,
+                    int src_stride_a,
+                    uint8_t* dst_abgr,
+                    int dst_stride_abgr,
+                    int width,
+                    int height,
+                    int attenuate) {
+  return I420AlphaToARGBMatrix(
+      src_y, src_stride_y, src_v, src_stride_v,  // Swap U and V
+      src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr,
+      &kYvuI601Constants,  // Use Yvu matrix
+      width, height, attenuate);
 }
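
The attenuate flag on these alpha entry points selects a premultiply pass over the converted output. A scalar sketch of the conventional premultiply dst = src * alpha / 255, which is the idea behind the ARGBAttenuateRow variants dispatched above (the library's exact rounding may differ):

    #include <stdint.h>
    static void ARGBAttenuateRow_Sketch(const uint8_t* src, uint8_t* dst,
                                        int width) {
      int x;
      for (x = 0; x < width; ++x) {
        uint32_t a = src[4 * x + 3];
        dst[4 * x + 0] = (uint8_t)(src[4 * x + 0] * a / 255); /* B */
        dst[4 * x + 1] = (uint8_t)(src[4 * x + 1] * a / 255); /* G */
        dst[4 * x + 2] = (uint8_t)(src[4 * x + 2] * a / 255); /* R */
        dst[4 * x + 3] = (uint8_t)a; /* alpha itself is unchanged */
      }
    }
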
 
 // Convert I400 to ARGB.
 LIBYUV_API
-int I400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int I400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*I400ToARGBRow)(const uint8* y_buf,
-                     uint8* rgb_buf,
-                     int width) = I400ToARGBRow_C;
-  if (!src_y || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) =
+      I400ToARGBRow_C;
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -713,8 +967,7 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -743,6 +996,14 @@
     }
   }
 #endif
+#if defined(HAS_I400TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I400ToARGBRow = I400ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      I400ToARGBRow = I400ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I400ToARGBRow(src_y, dst_argb, width);
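
I400 is a luma-only format, so the row function replicates Y into all three color channels with an opaque alpha. A sketch of the packing, assuming that behavior; note the library's I400ToARGBRow also applies the limited-range luma expansion (J400, below, is the full-range variant that copies Y straight through), which this illustration skips:

    #include <stdint.h>
    static void I400ToARGBRow_Sketch(const uint8_t* src_y, uint8_t* dst_argb,
                                     int width) {
      int x;
      for (x = 0; x < width; ++x) {
        uint8_t yv = src_y[x];
        dst_argb[4 * x + 0] = yv;   /* B */
        dst_argb[4 * x + 1] = yv;   /* G */
        dst_argb[4 * x + 2] = yv;   /* R */
        dst_argb[4 * x + 3] = 255u; /* A: opaque */
      }
    }
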
@@ -754,14 +1015,16 @@
 
 // Convert J400 to ARGB.
 LIBYUV_API
-int J400ToARGB(const uint8* src_y, int src_stride_y,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int J400ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) =
+  void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) =
       J400ToARGBRow_C;
-  if (!src_y || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -771,8 +1034,7 @@
     src_stride_y = -src_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -801,6 +1063,14 @@
     }
   }
 #endif
+#if defined(HAS_J400TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    J400ToARGBRow = J400ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      J400ToARGBRow = J400ToARGBRow_MSA;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
     J400ToARGBRow(src_y, dst_argb, width);
     src_y += src_stride_y;
@@ -810,85 +1080,89 @@
 }
 
 // Shuffle table for converting BGRA to ARGB.
-static uvec8 kShuffleMaskBGRAToARGB = {
-  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
-};
+static const uvec8 kShuffleMaskBGRAToARGB = {
+    3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
 
 // Shuffle table for converting ABGR to ARGB.
-static uvec8 kShuffleMaskABGRToARGB = {
-  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
-};
+static const uvec8 kShuffleMaskABGRToARGB = {
+    2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u};
 
 // Shuffle table for converting RGBA to ARGB.
-static uvec8 kShuffleMaskRGBAToARGB = {
-  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
-};
+static const uvec8 kShuffleMaskRGBAToARGB = {
+    1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u};
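
These 16-byte tables drive ARGBShuffle: each entry names the source byte for the corresponding destination byte. A sketch of the shuffle semantics, assuming the per-pixel form used by the portable fallback (the tables are 16 bytes so SIMD byte-shuffle instructions can permute four pixels at once; the pattern simply repeats every 4 bytes):

    /* With kShuffleMaskBGRAToARGB = {3,2,1,0, ...}, every 4-byte pixel
     * is reversed, turning BGRA byte order into ARGB. */
    #include <stdint.h>
    static void ARGBShuffleRow_Sketch(const uint8_t* src, uint8_t* dst,
                                      const uint8_t* shuffler, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst[0] = src[shuffler[0]];
        dst[1] = src[shuffler[1]];
        dst[2] = src[shuffler[2]];
        dst[3] = src[shuffler[3]];
        src += 4;
        dst += 4;
      }
    }
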
 
 // Convert BGRA to ARGB.
 LIBYUV_API
-int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_bgra, src_stride_bgra,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskBGRAToARGB),
-                     width, height);
+int BGRAToARGB(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
 }
 
 // Convert ARGB to BGRA (same as BGRAToARGB).
 LIBYUV_API
-int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_bgra, src_stride_bgra,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskBGRAToARGB),
-                     width, height);
+int ARGBToBGRA(const uint8_t* src_bgra,
+               int src_stride_bgra,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height);
 }
 
 // Convert ABGR to ARGB.
 LIBYUV_API
-int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_abgr, src_stride_abgr,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskABGRToARGB),
-                     width, height);
+int ABGRToARGB(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
 }
 
 // Convert ARGB to ABGR (same as ABGRToARGB).
 LIBYUV_API
-int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_abgr, src_stride_abgr,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskABGRToARGB),
-                     width, height);
+int ARGBToABGR(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height);
 }
 
 // Convert RGBA to ARGB.
 LIBYUV_API
-int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
-  return ARGBShuffle(src_rgba, src_stride_rgba,
-                     dst_argb, dst_stride_argb,
-                     (const uint8*)(&kShuffleMaskRGBAToARGB),
-                     width, height);
+int RGBAToARGB(const uint8_t* src_rgba,
+               int src_stride_rgba,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb,
+                     (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height);
 }
 
 // Convert RGB24 to ARGB.
 LIBYUV_API
-int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height) {
+int RGB24ToARGB(const uint8_t* src_rgb24,
+                int src_stride_rgb24,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height) {
   int y;
-  void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+  void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RGB24ToARGBRow_C;
-  if (!src_rgb24 || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -898,8 +1172,7 @@
     src_stride_rgb24 = -src_stride_rgb24;
   }
   // Coalesce rows.
-  if (src_stride_rgb24 == width * 3 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_rgb24 = dst_stride_argb = 0;
@@ -920,6 +1193,14 @@
     }
   }
 #endif
+#if defined(HAS_RGB24TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB24ToARGBRow = RGB24ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RGB24ToARGBRow(src_rgb24, dst_argb, width);
@@ -931,14 +1212,16 @@
 
 // Convert RAW to ARGB.
 LIBYUV_API
-int RAWToARGB(const uint8* src_raw, int src_stride_raw,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
+int RAWToARGB(const uint8_t* src_raw,
+              int src_stride_raw,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
   int y;
-  void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) =
+  void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
       RAWToARGBRow_C;
-  if (!src_raw || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_raw || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -948,8 +1231,7 @@
     src_stride_raw = -src_stride_raw;
   }
   // Coalesce rows.
-  if (src_stride_raw == width * 3 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_raw = dst_stride_argb = 0;
@@ -970,6 +1252,14 @@
     }
   }
 #endif
+#if defined(HAS_RAWTOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToARGBRow = RAWToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToARGBRow = RAWToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RAWToARGBRow(src_raw, dst_argb, width);
@@ -981,14 +1271,16 @@
 
 // Convert RGB565 to ARGB.
 LIBYUV_API
-int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int RGB565ToARGB(const uint8_t* src_rgb565,
+                 int src_stride_rgb565,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
-  void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) =
-      RGB565ToARGBRow_C;
-  if (!src_rgb565 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb,
+                          int width) = RGB565ToARGBRow_C;
+  if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -998,8 +1290,7 @@
     src_stride_rgb565 = -src_stride_rgb565;
   }
   // Coalesce rows.
-  if (src_stride_rgb565 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_rgb565 = dst_stride_argb = 0;
@@ -1028,6 +1319,14 @@
     }
   }
 #endif
+#if defined(HAS_RGB565TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RGB565ToARGBRow = RGB565ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RGB565ToARGBRow(src_rgb565, dst_argb, width);
@@ -1039,14 +1338,16 @@
 
 // Convert ARGB1555 to ARGB.
 LIBYUV_API
-int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
+int ARGB1555ToARGB(const uint8_t* src_argb1555,
+                   int src_stride_argb1555,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
-      int width) = ARGB1555ToARGBRow_C;
-  if (!src_argb1555 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb,
+                            int width) = ARGB1555ToARGBRow_C;
+  if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1056,8 +1357,7 @@
     src_stride_argb1555 = -src_stride_argb1555;
   }
   // Coalesce rows.
-  if (src_stride_argb1555 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb1555 = dst_stride_argb = 0;
@@ -1086,6 +1386,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGB1555TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
@@ -1097,14 +1405,16 @@
 
 // Convert ARGB4444 to ARGB.
 LIBYUV_API
-int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
-                   uint8* dst_argb, int dst_stride_argb,
-                   int width, int height) {
+int ARGB4444ToARGB(const uint8_t* src_argb4444,
+                   int src_stride_argb4444,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
-      int width) = ARGB4444ToARGBRow_C;
-  if (!src_argb4444 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb,
+                            int width) = ARGB4444ToARGBRow_C;
+  if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1114,8 +1424,7 @@
     src_stride_argb4444 = -src_stride_argb4444;
   }
   // Coalesce rows.
-  if (src_stride_argb4444 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb4444 = dst_stride_argb = 0;
@@ -1144,6 +1453,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGB4444TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
@@ -1153,25 +1470,122 @@
   return 0;
 }
 
-// Convert NV12 to ARGB.
+// Convert AR30 to ARGB.
 LIBYUV_API
-int NV12ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int AR30ToARGB(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV12ToARGBRow_C;
-  if (!src_y || !src_uv || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_ar30 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // Coalesce rows.
+  if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = dst_stride_argb = 0;
+  }
+  for (y = 0; y < height; ++y) {
+    AR30ToARGBRow_C(src_ar30, dst_argb, width);
+    src_ar30 += src_stride_ar30;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
+// Convert AR30 to ABGR.
+LIBYUV_API
+int AR30ToABGR(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  int y;
+  if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // Coalesce rows.
+  if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = dst_stride_abgr = 0;
+  }
+  for (y = 0; y < height; ++y) {
+    AR30ToABGRRow_C(src_ar30, dst_abgr, width);
+    src_ar30 += src_stride_ar30;
+    dst_abgr += dst_stride_abgr;
+  }
+  return 0;
+}
+
+// Convert AR30 to AB30.
+LIBYUV_API
+int AR30ToAB30(const uint8_t* src_ar30,
+               int src_stride_ar30,
+               uint8_t* dst_ab30,
+               int dst_stride_ab30,
+               int width,
+               int height) {
+  int y;
+  if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ar30 = src_ar30 + (height - 1) * src_stride_ar30;
+    src_stride_ar30 = -src_stride_ar30;
+  }
+  // Coalesce rows.
+  if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_ar30 = dst_stride_ab30 = 0;
+  }
+  for (y = 0; y < height; ++y) {
+    AR30ToAB30Row_C(src_ar30, dst_ab30, width);
+    src_ar30 += src_stride_ar30;
+    dst_ab30 += dst_stride_ab30;
+  }
+  return 0;
+}
+
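
The three AR30 converters above only have C paths so far. AR30 here is the 2.10.10.10 format: a 32-bit little-endian word carrying 10 bits each of blue, green and red from the low bits upward, plus 2 alpha bits on top (AB30 is the same with red and blue exchanged). A plausible sketch of what a row function such as AR30ToARGBRow_C does, keeping the top 8 bits of each 10-bit channel; the assumed layout and rounding may differ from upstream:

    #include <stdint.h>
    #include <string.h>

    /* Sketch: unpack one row of AR30 (2.10.10.10, B in the low bits)
     * into 8-bit ARGB. Assumed layout; not copied from upstream. */
    static void AR30ToARGBRow_Sketch(const uint8_t* src_ar30,
                                     uint8_t* dst_argb, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        uint32_t p;
        memcpy(&p, src_ar30, 4);          /* one little-endian pixel */
        uint32_t b = (p >> 2) & 0xff;     /* top 8 of 10 blue bits   */
        uint32_t g = (p >> 12) & 0xff;
        uint32_t r = (p >> 22) & 0xff;
        uint32_t a = (p >> 30) * 0x55;    /* replicate 2 alpha bits  */
        uint32_t argb = b | (g << 8) | (r << 16) | (a << 24);
        memcpy(dst_argb, &argb, 4);
        src_ar30 += 4;
        dst_argb += 4;
      }
    }
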
+// Convert NV12 to ARGB with matrix
+static int NV12ToARGBMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_uv,
+                            int src_stride_uv,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height) {
+  int y;
+  void (*NV12ToARGBRow)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
+  if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
@@ -1199,9 +1613,17 @@
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
-    NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
+    NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
@@ -1211,20 +1633,21 @@
   return 0;
 }
 
-// Convert NV21 to ARGB.
-LIBYUV_API
-int NV21ToARGB(const uint8* src_y, int src_stride_y,
-               const uint8* src_uv, int src_stride_uv,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+// Convert NV21 to ARGB with matrix
+static int NV21ToARGBMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_vu,
+                            int src_stride_vu,
+                            uint8_t* dst_argb,
+                            int dst_stride_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height) {
   int y;
-  void (*NV21ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV21ToARGBRow_C;
-  if (!src_y || !src_uv || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*NV21ToARGBRow)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C;
+  if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1257,12 +1680,137 @@
     }
   }
 #endif
+#if defined(HAS_NV21TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV21ToARGBRow = NV21ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToARGBRow = NV21ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
-    NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width);
+    NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width);
     dst_argb += dst_stride_argb;
     src_y += src_stride_y;
     if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb,
+                          dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
+  return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb,
+                          dst_stride_argb, &kYuvI601Constants, width, height);
+}
+
+// Convert NV12 to ABGR.
+// To output ABGR instead of ARGB, swap the UV and use a mirrored yuv matrix.
+// To swap the UV, pass the NV12 data to the NV21 converter.
+LIBYUV_API
+int NV12ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr,
+                          dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
+// Convert NV21 to ABGR.
+LIBYUV_API
+int NV21ToABGR(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_vu,
+               int src_stride_vu,
+               uint8_t* dst_abgr,
+               int dst_stride_abgr,
+               int width,
+               int height) {
+  return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr,
+                          dst_stride_abgr, &kYvuI601Constants, width, height);
+}
+
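
The two ABGR wrappers above rely on a symmetry rather than dedicated row code: feeding the chroma through the opposite path (NV12 data into the NV21 converter, or vice versa) while also using the mirrored Yvu constants leaves the Y'CbCr math intact but lands red in the blue byte and blue in the red byte, i.e. ABGR from an unchanged ARGB writer. A scalar illustration with textbook full-range BT.601 numbers (not libyuv's fixed-point tables):

    /* Illustration only: swapping the U/V inputs while also swapping the
     * R/B matrix coefficients reproduces the same two values in
     * exchanged output slots. */
    const float Rv = 1.402f, Bu = 1.772f;
    const float yv = 120.0f, u = 100.0f, v = 160.0f;
    const float red       = yv + Rv * (v - 128.0f);  /* ARGB red          */
    const float blue      = yv + Bu * (u - 128.0f);  /* ARGB blue         */
    const float red_slot  = yv + Bu * (u - 128.0f);  /* mirrored: == blue */
    const float blue_slot = yv + Rv * (v - 128.0f);  /* mirrored: == red  */
    /* The green terms swap symmetrically too, so green is unchanged. */
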
+// TODO(fbarchard): Consider an SSSE3 2-step conversion.
+// Convert NV12 to RGB24 with matrix
+static int NV12ToRGB24Matrix(const uint8_t* src_y,
+                             int src_stride_y,
+                             const uint8_t* src_uv,
+                             int src_stride_uv,
+                             uint8_t* dst_rgb24,
+                             int dst_stride_rgb24,
+                             const struct YuvConstants* yuvconstants,
+                             int width,
+                             int height) {
+  int y;
+  void (*NV12ToRGB24Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C;
+  if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_NV12TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB24Row = NV12ToRGB24Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      NV12ToRGB24Row = NV12ToRGB24Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      NV12ToRGB24Row = NV12ToRGB24Row_AVX2;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (y & 1) {
       src_uv += src_stride_uv;
     }
   }
@@ -1269,19 +1817,109 @@
   return 0;
 }
 
+// Convert NV21 to RGB24 with matrix
+static int NV21ToRGB24Matrix(const uint8_t* src_y,
+                             int src_stride_y,
+                             const uint8_t* src_vu,
+                             int src_stride_vu,
+                             uint8_t* dst_rgb24,
+                             int dst_stride_rgb24,
+                             const struct YuvConstants* yuvconstants,
+                             int width,
+                             int height) {
+  int y;
+  void (*NV21ToRGB24Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C;
+  if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+    dst_stride_rgb24 = -dst_stride_rgb24;
+  }
+#if defined(HAS_NV21TORGB24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      NV21ToRGB24Row = NV21ToRGB24Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToRGB24Row = NV21ToRGB24Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      NV21ToRGB24Row = NV21ToRGB24Row_AVX2;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width);
+    dst_rgb24 += dst_stride_rgb24;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
+// TODO(fbarchard): NV12ToRAW can be implemented with a mirrored matrix.
+// Convert NV12 to RGB24.
+LIBYUV_API
+int NV12ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_uv,
+                int src_stride_uv,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv,
+                           dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+                           width, height);
+}
+
+// Convert NV21 to RGB24.
+LIBYUV_API
+int NV21ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu,
+                           dst_rgb24, dst_stride_rgb24, &kYuvI601Constants,
+                           width, height);
+}
+
 // Convert M420 to ARGB.
 LIBYUV_API
-int M420ToARGB(const uint8* src_m420, int src_stride_m420,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int M420ToARGB(const uint8_t* src_m420,
+               int src_stride_m420,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*NV12ToARGBRow)(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = NV12ToARGBRow_C;
-  if (!src_m420 || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*NV12ToARGBRow)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C;
+  if (!src_m420 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1314,6 +1952,14 @@
     }
   }
 #endif
+#if defined(HAS_NV12TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToARGBRow = NV12ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToARGBRow = NV12ToARGBRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb,
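
For context on the two-pointer call above: M420 packs each pair of image rows as two rows of Y followed by one row of interleaved UV, so the chroma for rows y and y+1 sits two strides below the first luma row, and the walk advances three strides per two output rows. A layout sketch (my reading of the format, width 4):

    /* M420, one stride per line:
     *   Y00 Y01 Y02 Y03   <- luma row 0          (src_m420)
     *   Y10 Y11 Y12 Y13   <- luma row 1          (+1 stride)
     *   U0  V0  U1  V1    <- chroma for rows 0-1 (+2 strides)
     *   Y20 Y21 Y22 Y23   <- next group          (+3 strides)
     */
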
@@ -1332,17 +1978,17 @@
 
 // Convert YUY2 to ARGB.
 LIBYUV_API
-int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int YUY2ToARGB(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToARGBRow)(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) =
+  void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants, int width) =
       YUY2ToARGBRow_C;
-  if (!src_yuy2 || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1352,8 +1998,7 @@
     src_stride_yuy2 = -src_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_yuy2 = dst_stride_argb = 0;
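
YUY2 (and UYVY below) are packed 4:2:2 formats: two pixels share one U and one V sample inside a 4-byte macropixel, which is why a fully packed row is exactly width * 2 bytes and the same coalescing trick applies. Layout of one macropixel:

    /* Two image pixels in four bytes:
     *   YUY2: Y0 U  Y1 V
     *   UYVY: U  Y0 V  Y1
     */
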
@@ -1382,6 +2027,14 @@
     }
   }
 #endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToARGBRow = YUY2ToARGBRow_MSA;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
     YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width);
     src_yuy2 += src_stride_yuy2;
@@ -1392,17 +2045,17 @@
 
 // Convert UYVY to ARGB.
 LIBYUV_API
-int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int UYVYToARGB(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToARGBRow)(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) =
+  void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants, int width) =
       UYVYToARGBRow_C;
-  if (!src_uyvy || !dst_argb ||
-      width <= 0 || height == 0) {
+  if (!src_uyvy || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1412,8 +2065,7 @@
     src_stride_uyvy = -src_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_uyvy == width * 2 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_uyvy = dst_stride_argb = 0;
@@ -1442,6 +2094,14 @@
     }
   }
 #endif
+#if defined(HAS_UYVYTOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToARGBRow = UYVYToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      UYVYToARGBRow = UYVYToARGBRow_MSA;
+    }
+  }
+#endif
   for (y = 0; y < height; ++y) {
     UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width);
     src_uyvy += src_stride_uyvy;
@@ -1448,6 +2108,121 @@
     dst_argb += dst_stride_argb;
   }
   return 0;
+}
+
+static void WeavePixels(const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        int src_pixel_stride_uv,
+                        uint8_t* dst_uv,
+                        int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    dst_uv[0] = *src_u;
+    dst_uv[1] = *src_v;
+    dst_uv += 2;
+    src_u += src_pixel_stride_uv;
+    src_v += src_pixel_stride_uv;
+  }
+}
+
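
WeavePixels gathers U and V samples that sit src_pixel_stride_uv bytes apart into packed UV pairs; it exists for Android YUV_420_888 buffers whose chroma "planes" are interleaved views of one buffer. A self-checking example using the function exactly as defined above:

    /* Weave 4 chroma pairs with pixel stride 2: src_u and src_v are
     * offset views of the same interleaved buffer. */
    const uint8_t chroma[8] = {1, 2, 3, 4, 5, 6, 7, 8};  /* U V U V ... */
    uint8_t uv[8];
    WeavePixels(chroma, chroma + 1, /*src_pixel_stride_uv=*/2, uv, 4);
    /* uv now holds {1,2,3,4,5,6,7,8}: an NV12-style UV row. */
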
+// Convert Android420 to ARGB with matrix
+LIBYUV_API
+int Android420ToARGBMatrix(const uint8_t* src_y,
+                           int src_stride_y,
+                           const uint8_t* src_u,
+                           int src_stride_u,
+                           const uint8_t* src_v,
+                           int src_stride_v,
+                           int src_pixel_stride_uv,
+                           uint8_t* dst_argb,
+                           int dst_stride_argb,
+                           const struct YuvConstants* yuvconstants,
+                           int width,
+                           int height) {
+  int y;
+  uint8_t* dst_uv;
+  const ptrdiff_t vu_off = src_v - src_u;
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
+
+  // I420
+  if (src_pixel_stride_uv == 1) {
+    return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                            src_stride_v, dst_argb, dst_stride_argb,
+                            yuvconstants, width, height);
+  }
+  // NV21
+  if (src_pixel_stride_uv == 2 && vu_off == -1 &&
+      src_stride_u == src_stride_v) {
+    return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb,
+                            dst_stride_argb, yuvconstants, width, height);
+  }
+  // NV12
+  if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) {
+    return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb,
+                            dst_stride_argb, yuvconstants, width, height);
+  }
+
+  // General case fallback: weave U and V into a temporary NV12-style UV plane.
+  align_buffer_64(plane_uv, halfwidth * 2 * halfheight);
+  dst_uv = plane_uv;
+  for (y = 0; y < halfheight; ++y) {
+    WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth);
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+    dst_uv += halfwidth * 2;
+  }
+  NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb,
+                   dst_stride_argb, yuvconstants, width, height);
+  free_aligned_buffer_64(plane_uv);
+  return 0;
+}
+
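
The fast paths above classify the Android image purely from the chroma pointer arithmetic; only layouts matching none of the three patterns pay for the temporary weave. Summarizing the checks (vu_off = src_v - src_u):

    /* src_pixel_stride_uv == 1                    -> planar I420
     * src_pixel_stride_uv == 2 && vu_off == -1    -> NV21 (VUVU...)
     * src_pixel_stride_uv == 2 && vu_off == +1    -> NV12 (UVUV...)
     * otherwise: WeavePixels builds a temporary NV12 UV plane,
     * halfwidth * 2 bytes per row, halfheight rows. */
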
+// Convert Android420 to ARGB.
+LIBYUV_API
+int Android420ToARGB(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height) {
+  return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                                src_stride_v, src_pixel_stride_uv, dst_argb,
+                                dst_stride_argb, &kYuvI601Constants, width,
+                                height);
+}
+
+// Convert Android420 to ABGR.
+LIBYUV_API
+int Android420ToABGR(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_u,
+                     int src_stride_u,
+                     const uint8_t* src_v,
+                     int src_stride_v,
+                     int src_pixel_stride_uv,
+                     uint8_t* dst_abgr,
+                     int dst_stride_abgr,
+                     int width,
+                     int height) {
+  return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                                src_stride_u, src_pixel_stride_uv, dst_abgr,
+                                dst_stride_abgr, &kYvuI601Constants, width,
+                                height);
 }
 
 #ifdef __cplusplus
--- a/third_party/libyuv/source/convert_from.cc
+++ b/third_party/libyuv/source/convert_from.cc
@@ -15,9 +15,9 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/row.h"
 #include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/video_common.h"
-#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -30,109 +30,144 @@
 }
 
 // I420 to any I4xx YUV format with mirroring.
-static int I420ToI4xx(const uint8* src_y, int src_stride_y,
-                      const uint8* src_u, int src_stride_u,
-                      const uint8* src_v, int src_stride_v,
-                      uint8* dst_y, int dst_stride_y,
-                      uint8* dst_u, int dst_stride_u,
-                      uint8* dst_v, int dst_stride_v,
-                      int src_y_width, int src_y_height,
-                      int dst_uv_width, int dst_uv_height) {
+static int I420ToI4xx(const uint8_t* src_y,
+                      int src_stride_y,
+                      const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_y,
+                      int dst_stride_y,
+                      uint8_t* dst_u,
+                      int dst_stride_u,
+                      uint8_t* dst_v,
+                      int dst_stride_v,
+                      int src_y_width,
+                      int src_y_height,
+                      int dst_uv_width,
+                      int dst_uv_height) {
   const int dst_y_width = Abs(src_y_width);
   const int dst_y_height = Abs(src_y_height);
   const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
   const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
-  if (src_y_width == 0 || src_y_height == 0 ||
-      dst_uv_width <= 0 || dst_uv_height <= 0) {
+  if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 ||
+      dst_uv_height <= 0) {
     return -1;
   }
   if (dst_y) {
-    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
-               dst_y, dst_stride_y, dst_y_width, dst_y_height,
-               kFilterBilinear);
+    ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y,
+               dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear);
   }
-  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
-             dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
-  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
-             dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
-             kFilterBilinear);
+  ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u,
+             dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear);
+  ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v,
+             dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear);
   return 0;
 }
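
I420ToI4xx computes the source chroma size with SUBSAMPLE(dim, 1, 1) and leaves the actual 420-to-4xx resampling to ScalePlane. Assuming the macro keeps libyuv's usual definition (round the halving up, mirroring for negative sizes), odd dimensions keep their last chroma sample:

    /* Assumed definition, not shown in this patch:
     *   SUBSAMPLE(v, a, s) == v < 0 ? -((-v + a) >> s) : (v + a) >> s
     * so SUBSAMPLE(w, 1, 1) == (w + 1) / 2:
     *   width 7 -> 4 chroma samples, width 8 -> 4. */
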
 
+// Convert 8 bit YUV to 10 bit.
+LIBYUV_API
+int I420ToI010(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint16_t* dst_y,
+               int dst_stride_y,
+               uint16_t* dst_u,
+               int dst_stride_u,
+               uint16_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+
+  // Convert Y plane.
+  Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width,
+                    height);
+  // Convert UV planes.
+  Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth,
+                    halfheight);
+  Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth,
+                    halfheight);
+  return 0;
+}
+
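
I420ToI010 widens each plane with Convert8To16Plane and a scale of 1024. My reading of the upstream row code (an assumption, not shown in this patch) is that the 8-bit value is first replicated into 16 bits and the scale then selects the output depth, so 1024 yields a proper 10-bit range where 255 maps to 1023 rather than a bare shift's 1020:

    #include <stdint.h>

    /* Sketch of the assumed 8-to-10-bit widening (scale = 1024). */
    static uint16_t widen_to_10bit(uint8_t v) {
      uint32_t v16 = v * 0x0101u;               /* 0..255 -> 0..65535  */
      return (uint16_t)((v16 * 1024u) >> 16);   /* 0 -> 0, 255 -> 1023 */
    }
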
 // 420 chroma is 1/2 width, 1/2 height
 // 422 chroma is 1/2 width, 1x height
 LIBYUV_API
-int I420ToI422(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420ToI422(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   const int dst_uv_width = (Abs(width) + 1) >> 1;
   const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
+  return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, dst_uv_width,
+                    dst_uv_height);
 }
 
 // 420 chroma is 1/2 width, 1/2 height
 // 444 chroma is 1x width, 1x height
 LIBYUV_API
-int I420ToI444(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420ToI444(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   const int dst_uv_width = Abs(width);
   const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
+  return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                    src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                    dst_v, dst_stride_v, width, height, dst_uv_width,
+                    dst_uv_height);
 }
 
-// 420 chroma is 1/2 width, 1/2 height
-// 411 chroma is 1/4 width, 1x height
-LIBYUV_API
-int I420ToI411(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  const int dst_uv_width = (Abs(width) + 3) >> 2;
-  const int dst_uv_height = Abs(height);
-  return I420ToI4xx(src_y, src_stride_y,
-                    src_u, src_stride_u,
-                    src_v, src_stride_v,
-                    dst_y, dst_stride_y,
-                    dst_u, dst_stride_u,
-                    dst_v, dst_stride_v,
-                    width, height,
-                    dst_uv_width, dst_uv_height);
-}
-
 // Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
 LIBYUV_API
-int I400Copy(const uint8* src_y, int src_stride_y,
-             uint8* dst_y, int dst_stride_y,
-             int width, int height) {
-  if (!src_y || !dst_y ||
-      width <= 0 || height == 0) {
+int I400Copy(const uint8_t* src_y,
+             int src_stride_y,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             int width,
+             int height) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -146,17 +181,21 @@
 }
 
 LIBYUV_API
-int I422ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int I422ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
   int y;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_yuy2, int width) =
+  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
       I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -166,10 +205,8 @@
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_yuy2 == width * 2) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
@@ -182,6 +219,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -202,17 +247,21 @@
 }
 
 LIBYUV_API
-int I420ToYUY2(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int I420ToYUY2(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
   int y;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_yuy2, int width) =
+  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
       I422ToYUY2Row_C;
-  if (!src_y || !src_u || !src_v || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -229,6 +278,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -237,6 +294,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
@@ -254,17 +319,21 @@
 }
 
 LIBYUV_API
-int I422ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int I422ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_uyvy, int width) =
+  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
       I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -274,10 +343,8 @@
     dst_stride_uyvy = -dst_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      src_stride_u * 2 == width &&
-      src_stride_v * 2 == width &&
-      dst_stride_uyvy == width * 2) {
+  if (src_stride_y == width && src_stride_u * 2 == width &&
+      src_stride_v * 2 == width && dst_stride_uyvy == width * 2) {
     width *= height;
     height = 1;
     src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
@@ -290,6 +357,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -298,6 +373,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -310,17 +393,21 @@
 }
 
 LIBYUV_API
-int I420ToUYVY(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int I420ToUYVY(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-                        const uint8* src_v, uint8* dst_uyvy, int width) =
+  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
       I422ToUYVYRow_C;
-  if (!src_y || !src_u || !src_v || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -337,6 +424,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -345,6 +440,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -363,14 +466,20 @@
 
 // TODO(fbarchard): test negative height for invert.
 LIBYUV_API
-int I420ToNV12(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+int I420ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
+      height == 0) {
     return -1;
   }
   int halfwidth = (width + 1) / 2;
@@ -378,44 +487,47 @@
   if (dst_y) {
     CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
   }
-  MergeUVPlane(src_u, src_stride_u,
-               src_v, src_stride_v,
-               dst_uv, dst_stride_uv,
+  MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv,
                halfwidth, halfheight);
   return 0;
 }
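
I420ToNV12 is just CopyPlane for luma plus MergeUVPlane for chroma: the two half-resolution planes are interleaved into UVUV pairs, the stride-1 analogue of WeavePixels earlier in this patch. A small example using the call shape visible above:

    /* Interleave 2x2 U and V planes into an NV12 UV plane. */
    const uint8_t u_plane[4] = {1, 3, 5, 7};
    const uint8_t v_plane[4] = {2, 4, 6, 8};
    uint8_t uv_plane[8];
    MergeUVPlane(u_plane, 2, v_plane, 2, uv_plane, 4,
                 /*width=*/2, /*height=*/2);
    /* uv_plane == {1,2,3,4, 5,6,7,8} */
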
 
 LIBYUV_API
-int I420ToNV21(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_vu, int dst_stride_vu,
-               int width, int height) {
-  return I420ToNV12(src_y, src_stride_y,
-                    src_v, src_stride_v,
-                    src_u, src_stride_u,
-                    dst_y, dst_stride_y,
-                    dst_vu, dst_stride_vu,
+int I420ToNV21(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                    src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
                     width, height);
 }
 
 // Convert I420 to RGBA with matrix
-static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_rgba, int dst_stride_rgba,
+static int I420ToRGBAMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_rgba,
+                            int dst_stride_rgba,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToRGBARow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgba ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -448,13 +560,12 @@
     }
   }
 #endif
-#if defined(HAS_I422TORGBAROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
-    I422ToRGBARow = I422ToRGBARow_DSPR2;
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_MSA;
+    }
   }
 #endif
 
@@ -472,50 +583,58 @@
 
 // Convert I420 to RGBA.
 LIBYUV_API
-int I420ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return I420ToRGBAMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_rgba, dst_stride_rgba,
-                          &kYuvI601Constants,
-                          width, height);
+int I420ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I420 to BGRA.
 LIBYUV_API
-int I420ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height) {
-  return I420ToRGBAMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_bgra, dst_stride_bgra,
+int I420ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height) {
+  return I420ToRGBAMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_bgra, dst_stride_bgra,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
 
 // Convert I420 to RGB24 with matrix
-static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y,
-                             const uint8* src_u, int src_stride_u,
-                             const uint8* src_v, int src_stride_v,
-                             uint8* dst_rgb24, int dst_stride_rgb24,
+static int I420ToRGB24Matrix(const uint8_t* src_y,
+                             int src_stride_y,
+                             const uint8_t* src_u,
+                             int src_stride_u,
+                             const uint8_t* src_v,
+                             int src_stride_v,
+                             uint8_t* dst_rgb24,
+                             int dst_stride_rgb24,
                              const struct YuvConstants* yuvconstants,
-                             int width, int height) {
+                             int width,
+                             int height) {
   int y;
-  void (*I422ToRGB24Row)(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         const struct YuvConstants* yuvconstants,
-                         int width) = I422ToRGB24Row_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb24 ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                         const uint8_t* v_buf, uint8_t* rgb_buf,
+                         const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB24Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -548,6 +667,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB24Row = I422ToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB24Row = I422ToRGB24Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width);
@@ -563,50 +690,95 @@
 
 // Convert I420 to RGB24.
 LIBYUV_API
-int I420ToRGB24(const uint8* src_y, int src_stride_y,
-                const uint8* src_u, int src_stride_u,
-                const uint8* src_v, int src_stride_v,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height) {
-  return I420ToRGB24Matrix(src_y, src_stride_y,
-                           src_u, src_stride_u,
-                           src_v, src_stride_v,
-                           dst_rgb24, dst_stride_rgb24,
-                           &kYuvI601Constants,
-                           width, height);
+int I420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_rgb24, dst_stride_rgb24,
+                           &kYuvI601Constants, width, height);
 }
 
 // Convert I420 to RAW.
 LIBYUV_API
-int I420ToRAW(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              uint8* dst_raw, int dst_stride_raw,
-              int width, int height) {
-  return I420ToRGB24Matrix(src_y, src_stride_y,
-                           src_v, src_stride_v,  // Swap U and V
-                           src_u, src_stride_u,
-                           dst_raw, dst_stride_raw,
+int I420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+                           src_stride_v,  // Swap U and V
+                           src_u, src_stride_u, dst_raw, dst_stride_raw,
                            &kYvuI601Constants,  // Use Yvu matrix
                            width, height);
 }
 
+// Convert H420 to RGB24.
+LIBYUV_API
+int H420ToRGB24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_u,
+                int src_stride_u,
+                const uint8_t* src_v,
+                int src_stride_v,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                           src_stride_v, dst_rgb24, dst_stride_rgb24,
+                           &kYuvH709Constants, width, height);
+}
+
+// Convert H420 to RAW.
+LIBYUV_API
+int H420ToRAW(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
+  return I420ToRGB24Matrix(src_y, src_stride_y, src_v,
+                           src_stride_v,  // Swap U and V
+                           src_u, src_stride_u, dst_raw, dst_stride_raw,
+                           &kYvuH709Constants,  // Use Yvu matrix
+                           width, height);
+}
+
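
H420ToRGB24 and H420ToRAW differ from their I420 counterparts only in the constant table: kYuvH709Constants selects BT.709 coefficients instead of BT.601. For orientation, the classical limited-range floating-point forms that such fixed-point tables approximate (standard values, not lifted from this file):

    /* 8-bit limited-range Y'CbCr -> R'G'B':
     * BT.601 (kYuvI601Constants):
     *   R = 1.164*(Y-16)                 + 1.596*(V-128)
     *   G = 1.164*(Y-16) - 0.392*(U-128) - 0.813*(V-128)
     *   B = 1.164*(Y-16) + 2.017*(U-128)
     * BT.709 (kYuvH709Constants):
     *   R = 1.164*(Y-16)                 + 1.793*(V-128)
     *   G = 1.164*(Y-16) - 0.213*(U-128) - 0.533*(V-128)
     *   B = 1.164*(Y-16) + 2.112*(U-128)
     */
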
 // Convert I420 to ARGB1555.
 LIBYUV_API
-int I420ToARGB1555(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height) {
+int I420ToARGB1555(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height) {
   int y;
-  void (*I422ToARGB1555Row)(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
+  void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                            const uint8_t* v_buf, uint8_t* rgb_buf,
                             const struct YuvConstants* yuvconstants,
                             int width) = I422ToARGB1555Row_C;
-  if (!src_y || !src_u || !src_v || !dst_argb1555 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -639,6 +811,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOARGB1555ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB1555Row = I422ToARGB1555Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants,
@@ -653,23 +833,25 @@
   return 0;
 }
 
-
 // Convert I420 to ARGB4444.
 LIBYUV_API
-int I420ToARGB4444(const uint8* src_y, int src_stride_y,
-                   const uint8* src_u, int src_stride_u,
-                   const uint8* src_v, int src_stride_v,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height) {
+int I420ToARGB4444(const uint8_t* src_y,
+                   int src_stride_y,
+                   const uint8_t* src_u,
+                   int src_stride_u,
+                   const uint8_t* src_v,
+                   int src_stride_v,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height) {
   int y;
-  void (*I422ToARGB4444Row)(const uint8* y_buf,
-                            const uint8* u_buf,
-                            const uint8* v_buf,
-                            uint8* rgb_buf,
+  void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                            const uint8_t* v_buf, uint8_t* rgb_buf,
                             const struct YuvConstants* yuvconstants,
                             int width) = I422ToARGB4444Row_C;
-  if (!src_y || !src_u || !src_v || !dst_argb4444 ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -702,6 +884,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOARGB4444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGB4444Row = I422ToARGB4444Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants,
@@ -718,20 +908,22 @@
 
 // Convert I420 to RGB565.
 LIBYUV_API
-int I420ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_u, int src_stride_u,
-                 const uint8* src_v, int src_stride_v,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
+int I420ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
   int y;
-  void (*I422ToRGB565Row)(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* rgb_buf,
-                          const struct YuvConstants* yuvconstants,
-                          int width) = I422ToRGB565Row_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                          const uint8_t* v_buf, uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -764,6 +956,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
@@ -777,32 +977,102 @@
   return 0;
 }
 
+// Convert I422 to RGB565.
+LIBYUV_API
+int I422ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_u,
+                 int src_stride_u,
+                 const uint8_t* src_v,
+                 int src_stride_v,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
+  int y;
+  void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                          const uint8_t* v_buf, uint8_t* rgb_buf,
+                          const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGB565Row_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+    dst_stride_rgb565 = -dst_stride_rgb565;
+  }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToRGB565Row = I422ToRGB565Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_I422TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGB565Row = I422ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGB565Row = I422ToRGB565Row_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width);
+    dst_rgb565 += dst_stride_rgb565;
+    src_y += src_stride_y;
+    src_u += src_stride_u;
+    src_v += src_stride_v;
+  }
+  return 0;
+}
+
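I422ToRGB565 is new in this drop and differs from the I420 variant above only in chroma stepping: with 4:2:2 input the u and v rows advance on every output row, so the `if (y & 1)` gate used by the 4:2:0 loops disappears. A minimal usage sketch, assuming tightly packed buffers:

    int half = (width + 1) / 2;
    libyuv::I422ToRGB565(src_y, width, src_u, half, src_v, half,
                         dst_rgb565, width * 2, width, height);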
 // Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
-static const uint8 kDither565_4x4[16] = {
-  0, 4, 1, 5,
-  6, 2, 7, 3,
-  1, 5, 0, 4,
-  7, 3, 6, 2,
+static const uint8_t kDither565_4x4[16] = {
+    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
 };
 
 // Convert I420 to RGB565 with dithering.
 LIBYUV_API
-int I420ToRGB565Dither(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height) {
+int I420ToRGB565Dither(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height) {
   int y;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToARGBRow_C;
-  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
-      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgb565 ||
-      width <= 0 || height == 0) {
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToARGBRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                                const uint32_t dither4, int width) =
+      ARGBToRGB565DitherRow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -838,12 +1108,12 @@
     }
   }
 #endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
   }
 #endif
 #if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
@@ -870,6 +1140,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+    }
+  }
+#endif
   {
     // Allocate a row of argb.
     align_buffer_64(row_argb, width * 4);
@@ -876,7 +1154,8 @@
     for (y = 0; y < height; ++y) {
       I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width);
       ARGBToRGB565DitherRow(row_argb, dst_rgb565,
-                            *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+                            *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+                            width);
       dst_rgb565 += dst_stride_rgb565;
       src_y += src_stride_y;
       if (y & 1) {
@@ -889,220 +1168,254 @@
   return 0;
 }
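The per-row dither constant is row (y & 3) of the 4x4 table, loaded as one little-endian uint32_t via `*(const uint32_t*)(dither4x4 + ((y & 3) << 2))`. That cast assumes the caller's table is 4-byte aligned; a strictly alignment- and aliasing-safe equivalent (illustrative, not in the patch) is:

    uint32_t dither4;
    memcpy(&dither4, dither4x4 + ((y & 3) << 2), sizeof(dither4));
    ARGBToRGB565DitherRow(row_argb, dst_rgb565, dither4, width);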
 
+// Convert I420 to AR30 with matrix.
+static int I420ToAR30Matrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_ar30,
+                            int dst_stride_ar30,
+                            const struct YuvConstants* yuvconstants,
+                            int width,
+                            int height) {
+  int y;
+  void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToAR30Row_C;
+
+  if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30;
+    dst_stride_ar30 = -dst_stride_ar30;
+  }
+
+#if defined(HAS_I422TOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToAR30Row = I422ToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_I422TOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToAR30Row = I422ToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToAR30Row = I422ToAR30Row_AVX2;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
+    dst_ar30 += dst_stride_ar30;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_u += src_stride_u;
+      src_v += src_stride_v;
+    }
+  }
+  return 0;
+}
+
+// Convert I420 to AR30.
+LIBYUV_API
+int I420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYuvI601Constants, width, height);
+}
+
+// Convert H420 to AR30.
+LIBYUV_API
+int H420ToAR30(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_ar30, dst_stride_ar30,
+                          &kYvuH709Constants, width, height);
+}
+
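I420ToAR30Matrix follows the same matrix-parameterized shape as the RGB24 helper, with I420ToAR30 binding BT.601 constants and H420ToAR30 binding BT.709. AR30 is a 2:10:10:10 packed format; a packing sketch under the assumption that the three 10-bit channels are already computed (consult row_common.cc for the exact channel order libyuv assigns to the low bits):

    // Illustrative only: c0 lands in bits 0-9, c2 in bits 20-29, and the
    // 2-bit alpha field is forced opaque.
    static inline uint32_t Pack1010102(uint32_t c0, uint32_t c1, uint32_t c2) {
      return (3u << 30) | (c2 << 20) | (c1 << 10) | c0;
    }

Note that H420ToAR30 passes kYvuH709Constants without swapping its u/v arguments, unlike the RAW wrapper near the top of this file, which swaps the planes and the matrix together.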
 // Convert I420 to the specified format.
 LIBYUV_API
-int ConvertFromI420(const uint8* y, int y_stride,
-                    const uint8* u, int u_stride,
-                    const uint8* v, int v_stride,
-                    uint8* dst_sample, int dst_sample_stride,
-                    int width, int height,
-                    uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
+int ConvertFromI420(const uint8_t* y,
+                    int y_stride,
+                    const uint8_t* u,
+                    int u_stride,
+                    const uint8_t* v,
+                    int v_stride,
+                    uint8_t* dst_sample,
+                    int dst_sample_stride,
+                    int width,
+                    int height,
+                    uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
   int r = 0;
-  if (!y || !u|| !v || !dst_sample ||
-      width <= 0 || height == 0) {
+  if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) {
     return -1;
   }
   switch (format) {
     // Single plane formats
     case FOURCC_YUY2:
-      r = I420ToYUY2(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
-                     width, height);
+      r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2, width,
+                     height);
       break;
     case FOURCC_UYVY:
-      r = I420ToUYVY(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 2,
-                     width, height);
+      r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 2, width,
+                     height);
       break;
     case FOURCC_RGBP:
-      r = I420ToRGB565(y, y_stride,
-                       u, u_stride,
-                       v, v_stride,
-                       dst_sample,
-                       dst_sample_stride ? dst_sample_stride : width * 2,
-                       width, height);
+      r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                       dst_sample_stride ? dst_sample_stride : width * 2, width,
+                       height);
       break;
     case FOURCC_RGBO:
-      r = I420ToARGB1555(y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_sample,
+      r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                          dst_sample_stride ? dst_sample_stride : width * 2,
                          width, height);
       break;
     case FOURCC_R444:
-      r = I420ToARGB4444(y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         dst_sample,
+      r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
                          dst_sample_stride ? dst_sample_stride : width * 2,
                          width, height);
       break;
     case FOURCC_24BG:
-      r = I420ToRGB24(y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      dst_sample,
-                      dst_sample_stride ? dst_sample_stride : width * 3,
-                      width, height);
+      r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                      dst_sample_stride ? dst_sample_stride : width * 3, width,
+                      height);
       break;
     case FOURCC_RAW:
-      r = I420ToRAW(y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    dst_sample,
-                    dst_sample_stride ? dst_sample_stride : width * 3,
-                    width, height);
+      r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                    dst_sample_stride ? dst_sample_stride : width * 3, width,
+                    height);
       break;
     case FOURCC_ARGB:
-      r = I420ToARGB(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_BGRA:
-      r = I420ToBGRA(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_ABGR:
-      r = I420ToABGR(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
     case FOURCC_RGBA:
-      r = I420ToRGBA(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width * 4,
-                     width, height);
+      r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
       break;
+    case FOURCC_AR30:
+      r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width * 4, width,
+                     height);
+      break;
     case FOURCC_I400:
-      r = I400Copy(y, y_stride,
-                   dst_sample,
-                   dst_sample_stride ? dst_sample_stride : width,
-                   width, height);
+      r = I400Copy(y, y_stride, dst_sample,
+                   dst_sample_stride ? dst_sample_stride : width, width,
+                   height);
       break;
     case FOURCC_NV12: {
-      uint8* dst_uv = dst_sample + width * height;
-      r = I420ToNV12(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     dst_uv,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     width, height);
+      uint8_t* dst_uv = dst_sample + width * height;
+      r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width, dst_uv,
+                     dst_sample_stride ? dst_sample_stride : width, width,
+                     height);
       break;
     }
     case FOURCC_NV21: {
-      uint8* dst_vu = dst_sample + width * height;
-      r = I420ToNV21(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     dst_vu,
-                     dst_sample_stride ? dst_sample_stride : width,
-                     width, height);
+      uint8_t* dst_vu = dst_sample + width * height;
+      r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride ? dst_sample_stride : width, dst_vu,
+                     dst_sample_stride ? dst_sample_stride : width, width,
+                     height);
       break;
     }
     // TODO(fbarchard): Add M420.
     // Triplanar formats
-    // TODO(fbarchard): halfstride instead of halfwidth
     case FOURCC_I420:
     case FOURCC_YV12: {
-      int halfwidth = (width + 1) / 2;
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+      int halfstride = (dst_sample_stride + 1) / 2;
       int halfheight = (height + 1) / 2;
-      uint8* dst_u;
-      uint8* dst_v;
+      uint8_t* dst_u;
+      uint8_t* dst_v;
       if (format == FOURCC_YV12) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + halfwidth * halfheight;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * halfheight;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + halfwidth * halfheight;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * halfheight;
       }
-      r = I420Copy(y, y_stride,
-                   u, u_stride,
-                   v, v_stride,
-                   dst_sample, width,
-                   dst_u, halfwidth,
-                   dst_v, halfwidth,
+      r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                   dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
                    width, height);
       break;
     }
     case FOURCC_I422:
     case FOURCC_YV16: {
-      int halfwidth = (width + 1) / 2;
-      uint8* dst_u;
-      uint8* dst_v;
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+      int halfstride = (dst_sample_stride + 1) / 2;
+      uint8_t* dst_u;
+      uint8_t* dst_v;
       if (format == FOURCC_YV16) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + halfwidth * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + halfstride * height;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + halfwidth * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + halfstride * height;
       }
-      r = I420ToI422(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, halfwidth,
-                     dst_v, halfwidth,
+      r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride, dst_u, halfstride, dst_v, halfstride,
                      width, height);
       break;
     }
     case FOURCC_I444:
     case FOURCC_YV24: {
-      uint8* dst_u;
-      uint8* dst_v;
+      dst_sample_stride = dst_sample_stride ? dst_sample_stride : width;
+      uint8_t* dst_u;
+      uint8_t* dst_v;
       if (format == FOURCC_YV24) {
-        dst_v = dst_sample + width * height;
-        dst_u = dst_v + width * height;
+        dst_v = dst_sample + dst_sample_stride * height;
+        dst_u = dst_v + dst_sample_stride * height;
       } else {
-        dst_u = dst_sample + width * height;
-        dst_v = dst_u + width * height;
+        dst_u = dst_sample + dst_sample_stride * height;
+        dst_v = dst_u + dst_sample_stride * height;
       }
-      r = I420ToI444(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, width,
-                     dst_v, width,
-                     width, height);
+      r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample,
+                     dst_sample_stride, dst_u, dst_sample_stride, dst_v,
+                     dst_sample_stride, width, height);
       break;
     }
-    case FOURCC_I411: {
-      int quarterwidth = (width + 3) / 4;
-      uint8* dst_u = dst_sample + width * height;
-      uint8* dst_v = dst_u + quarterwidth * height;
-      r = I420ToI411(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     dst_sample, width,
-                     dst_u, quarterwidth,
-                     dst_v, quarterwidth,
-                     width, height);
-      break;
-    }
-
     // Formats not supported - MJPG, biplanar, some rgb formats.
     default:
       return -1;  // unknown fourcc - return failure code.
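ConvertFromI420 now honors dst_sample_stride for the triplanar outputs as well: a zero stride still means tightly packed (width-derived) planes, but a caller-supplied stride is halved with rounding for the subsampled planes instead of being ignored. A usage sketch for a tightly packed YV12 target, assuming even dimensions:

    // Passing dst_sample_stride = 0 selects width-derived strides.
    uint8_t* dst = (uint8_t*)malloc((size_t)width * height * 3 / 2);
    int r = libyuv::ConvertFromI420(y, y_stride, u, u_stride, v, v_stride,
                                    dst, 0, width, height,
                                    libyuv::FOURCC_YV12);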
--- a/third_party/libyuv/source/convert_from_argb.cc
+++ b/third_party/libyuv/source/convert_from_argb.cc
@@ -22,16 +22,21 @@
 
 // ARGB little endian (bgra in memory) to I444
 LIBYUV_API
-int ARGBToI444(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI444(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int width) = ARGBToUV444Row_C;
+  void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u,
+                         uint8_t* dst_v, int width) = ARGBToUV444Row_C;
   if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
@@ -41,20 +46,18 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u == width &&
-      dst_stride_v == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u == width && dst_stride_v == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
   }
 #if defined(HAS_ARGBTOUV444ROW_SSSE3)
-    if (TestCpuFlag(kCpuHasSSSE3)) {
-      ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUV444Row = ARGBToUV444Row_SSSE3;
-      }
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+    }
   }
 #endif
 #if defined(HAS_ARGBTOUV444ROW_NEON)
@@ -65,6 +68,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOUV444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUV444Row = ARGBToUV444Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV444Row = ARGBToUV444Row_MSA;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
@@ -89,6 +100,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToUV444Row(src_argb, dst_u, dst_v, width);
@@ -103,19 +122,23 @@
 
 // ARGB little endian (bgra in memory) to I422
 LIBYUV_API
-int ARGBToI422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToI422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -125,10 +148,8 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -170,81 +191,25 @@
   }
 #endif
 
-  for (y = 0; y < height; ++y) {
-    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
-    ARGBToYRow(src_argb, dst_y, width);
-    src_argb += src_stride_argb;
-    dst_y += dst_stride_y;
-    dst_u += dst_stride_u;
-    dst_v += dst_stride_v;
-  }
-  return 0;
-}
-
-// ARGB little endian (bgra in memory) to I411
-LIBYUV_API
-int ARGBToI411(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
-  int y;
-  void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-      int width) = ARGBToUV411Row_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
-      ARGBToYRow_C;
-  if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
-    return -1;
-  }
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-  // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width &&
-      dst_stride_u * 4 == width &&
-      dst_stride_v * 4 == width) {
-    width *= height;
-    height = 1;
-    src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
-  }
-#if defined(HAS_ARGBTOYROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToYRow = ARGBToYRow_Any_SSSE3;
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToYRow = ARGBToYRow_SSSE3;
+      ARGBToYRow = ARGBToYRow_MSA;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYROW_AVX2)
-  if (TestCpuFlag(kCpuHasAVX2)) {
-    ARGBToYRow = ARGBToYRow_Any_AVX2;
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
     if (IS_ALIGNED(width, 32)) {
-      ARGBToYRow = ARGBToYRow_AVX2;
+      ARGBToUVRow = ARGBToUVRow_MSA;
     }
   }
 #endif
-#if defined(HAS_ARGBTOYROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToYRow = ARGBToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      ARGBToYRow = ARGBToYRow_NEON;
-    }
-  }
-#endif
-#if defined(HAS_ARGBTOUV411ROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
-    if (IS_ALIGNED(width, 32)) {
-      ARGBToUV411Row = ARGBToUV411Row_NEON;
-    }
-  }
-#endif
 
   for (y = 0; y < height; ++y) {
-    ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+    ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
     ARGBToYRow(src_argb, dst_y, width);
     src_argb += src_stride_argb;
     dst_y += dst_stride_y;
@@ -255,21 +220,24 @@
 }
 
 LIBYUV_API
-int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int ARGBToNV12(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                      int width) = MergeUVRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_uv, int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -314,6 +282,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -338,10 +322,18 @@
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
   {
     // Allocate a row of u and a row of v.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
-    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
 
     for (y = 0; y < height - 1; y += 2) {
       ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
@@ -364,21 +356,24 @@
 
 // Same as NV12 but U and V swapped.
 LIBYUV_API
-int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int ARGBToNV21(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                      int width) = MergeUVRow_C;
-  if (!src_argb ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v,
+                      uint8_t* dst_vu, int width) = MergeUVRow_C;
+  if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -423,6 +418,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     MergeUVRow_ = MergeUVRow_Any_SSE2;
@@ -447,23 +458,31 @@
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow_ = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(halfwidth, 16)) {
+      MergeUVRow_ = MergeUVRow_MSA;
+    }
+  }
+#endif
   {
     // Allocate a row of u and a row of v.
     align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
-    uint8* row_v = row_u + ((halfwidth + 31) & ~31);
+    uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
 
     for (y = 0; y < height - 1; y += 2) {
       ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
-      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
       ARGBToYRow(src_argb, dst_y, width);
       ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
       src_argb += src_stride_argb * 2;
       dst_y += dst_stride_y * 2;
-      dst_uv += dst_stride_uv;
+      dst_vu += dst_stride_vu;
     }
     if (height & 1) {
       ARGBToUVRow(src_argb, 0, row_u, row_v, width);
-      MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+      MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
       ARGBToYRow(src_argb, dst_y, width);
     }
     free_aligned_buffer_64(row_u);
@@ -473,19 +492,23 @@
 
 // Convert ARGB to YUY2.
 LIBYUV_API
-int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yuy2, int dst_stride_yuy2,
-               int width, int height) {
+int ARGBToYUY2(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yuy2,
+               int dst_stride_yuy2,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
-      const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+  void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_yuy2, int width) =
+      I422ToYUY2Row_C;
 
-  if (!src_argb || !dst_yuy2 ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -495,8 +518,7 @@
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yuy2 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_yuy2 = 0;
@@ -537,6 +559,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
@@ -545,6 +583,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOYUY2ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
@@ -553,12 +599,20 @@
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
 
   {
     // Allocate rows of y, u and v.
     align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-    uint8* row_u = row_y + ((width + 63) & ~63);
-    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+    uint8_t* row_u = row_y + ((width + 63) & ~63);
+    uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
 
     for (y = 0; y < height; ++y) {
       ARGBToUVRow(src_argb, 0, row_u, row_v, width);
@@ -575,19 +629,23 @@
 
 // Convert ARGB to UYVY.
 LIBYUV_API
-int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_uyvy, int dst_stride_uyvy,
-               int width, int height) {
+int ARGBToUYVY(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_uyvy,
+               int dst_stride_uyvy,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb,
-      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb,
+                      uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVRow_C;
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
-  void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
-      const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+  void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u,
+                        const uint8_t* src_v, uint8_t* dst_uyvy, int width) =
+      I422ToUYVYRow_C;
 
-  if (!src_argb || !dst_uyvy ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_uyvy || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -597,8 +655,7 @@
     dst_stride_uyvy = -dst_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_uyvy == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_uyvy = 0;
@@ -639,6 +696,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVRow = ARGBToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
@@ -647,6 +720,14 @@
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_I422TOUYVYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
@@ -655,12 +736,20 @@
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   {
     // Allocate rows of y, u and v.
     align_buffer_64(row_y, ((width + 63) & ~63) * 2);
-    uint8* row_u = row_y + ((width + 63) & ~63);
-    uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+    uint8_t* row_u = row_y + ((width + 63) & ~63);
+    uint8_t* row_v = row_u + ((width + 63) & ~63) / 2;
 
     for (y = 0; y < height; ++y) {
       ARGBToUVRow(src_argb, 0, row_u, row_v, width);
@@ -677,11 +766,14 @@
 
 // Convert ARGB to I400.
 LIBYUV_API
-int ARGBToI400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int ARGBToI400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) =
+  void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
       ARGBToYRow_C;
   if (!src_argb || !dst_y || width <= 0 || height == 0) {
     return -1;
@@ -692,8 +784,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_y == width) {
+  if (src_stride_argb == width * 4 && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_y = 0;
@@ -722,6 +813,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYRow = ARGBToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToYRow(src_argb, dst_y, width);
@@ -732,28 +831,31 @@
 }
 
 // Shuffle table for converting ARGB to RGBA.
-static uvec8 kShuffleMaskARGBToRGBA = {
-  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
-};
+static const uvec8 kShuffleMaskARGBToRGBA = {
+    3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u};
 
 // Convert ARGB to RGBA.
 LIBYUV_API
-int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return ARGBShuffle(src_argb, src_stride_argb,
-                     dst_rgba, dst_stride_rgba,
-                     (const uint8*)(&kShuffleMaskARGBToRGBA),
-                     width, height);
+int ARGBToRGBA(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba,
+                     (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height);
 }
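ARGBToRGBA is just ARGBShuffle driven by a 16-entry byte-index table that repeats every four pixels, so any per-pixel byte permutation gets the SIMD shuffle paths for free. An illustrative custom mask (this one reverses each pixel's bytes, the ARGB-to-BGRA ordering; the table name is local to the sketch):

    static const uint8_t kShuffleARGBToBGRA[16] = {
        3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u};
    libyuv::ARGBShuffle(src_argb, src_stride_argb, dst_bgra, dst_stride_bgra,
                        kShuffleARGBToBGRA, width, height);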
 
 // Convert ARGB To RGB24.
 LIBYUV_API
-int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_rgb24, int dst_stride_rgb24,
-                int width, int height) {
+int ARGBToRGB24(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_rgb24,
+                int dst_stride_rgb24,
+                int width,
+                int height) {
   int y;
-  void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
+  void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
       ARGBToRGB24Row_C;
   if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
@@ -764,8 +866,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_rgb24 == width * 3) {
+  if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_rgb24 = 0;
@@ -778,6 +879,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+  if (TestCpuFlag(kCpuHasAVX512VBMI)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTORGB24ROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
@@ -786,6 +903,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRGB24Row = ARGBToRGB24Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRGB24Row(src_argb, dst_rgb24, width);
@@ -797,11 +922,14 @@
 
 // Convert ARGB To RAW.
 LIBYUV_API
-int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_raw, int dst_stride_raw,
-              int width, int height) {
+int ARGBToRAW(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_raw,
+              int dst_stride_raw,
+              int width,
+              int height) {
   int y;
-  void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) =
+  void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
       ARGBToRAWRow_C;
   if (!src_argb || !dst_raw || width <= 0 || height == 0) {
     return -1;
@@ -812,8 +940,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_raw == width * 3) {
+  if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_raw = 0;
@@ -826,6 +953,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToRAWRow = ARGBToRAWRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_ARGBTORAWROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
@@ -834,6 +969,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORAWROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRAWRow = ARGBToRAWRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToRAWRow = ARGBToRAWRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRAWRow(src_argb, dst_raw, width);
@@ -844,21 +987,23 @@
 }
 
 // Ordered 4x4 dither for 888 to 565. Values from 0 to 7.
-static const uint8 kDither565_4x4[16] = {
-  0, 4, 1, 5,
-  6, 2, 7, 3,
-  1, 5, 0, 4,
-  7, 3, 6, 2,
+static const uint8_t kDither565_4x4[16] = {
+    0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2,
 };
 
 // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes).
 LIBYUV_API
-int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_rgb565, int dst_stride_rgb565,
-                       const uint8* dither4x4, int width, int height) {
+int ARGBToRGB565Dither(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_rgb565,
+                       int dst_stride_rgb565,
+                       const uint8_t* dither4x4,
+                       int width,
+                       int height) {
   int y;
-  void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb,
-      const uint32 dither4, int width) = ARGBToRGB565DitherRow_C;
+  void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                                const uint32_t dither4, int width) =
+      ARGBToRGB565DitherRow_C;
   if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
@@ -894,9 +1039,19 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA;
+    }
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBToRGB565DitherRow(src_argb, dst_rgb565,
-                          *(uint32*)(dither4x4 + ((y & 3) << 2)), width);
+                          *(const uint32_t*)(dither4x4 + ((y & 3) << 2)),
+                          width);
     src_argb += src_stride_argb;
     dst_rgb565 += dst_stride_rgb565;
   }
@@ -906,12 +1061,15 @@
 // Convert ARGB To RGB565.
 // TODO(fbarchard): Consider using dither function low level with zeros.
 LIBYUV_API
-int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
+int ARGBToRGB565(const uint8_t* src_argb,
+                 int src_stride_argb,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToRGB565Row_C;
+  void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                          int width) = ARGBToRGB565Row_C;
   if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
@@ -921,8 +1079,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_rgb565 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_rgb565 = 0;
@@ -951,6 +1108,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToRGB565Row = ARGBToRGB565Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToRGB565Row(src_argb, dst_rgb565, width);
@@ -962,12 +1127,15 @@
 
 // Convert ARGB To ARGB1555.
 LIBYUV_API
-int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb1555, int dst_stride_argb1555,
-                   int width, int height) {
+int ARGBToARGB1555(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb1555,
+                   int dst_stride_argb1555,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToARGB1555Row_C;
+  void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                            int width) = ARGBToARGB1555Row_C;
   if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
     return -1;
   }
@@ -977,8 +1145,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb1555 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb1555 = 0;
@@ -1007,6 +1174,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOARGB1555ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB1555Row = ARGBToARGB1555Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToARGB1555Row(src_argb, dst_argb1555, width);
@@ -1018,12 +1193,15 @@
 
 // Convert ARGB To ARGB4444.
 LIBYUV_API
-int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb4444, int dst_stride_argb4444,
-                   int width, int height) {
+int ARGBToARGB4444(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb4444,
+                   int dst_stride_argb4444,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
-      ARGBToARGB4444Row_C;
+  void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb,
+                            int width) = ARGBToARGB4444Row_C;
   if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
     return -1;
   }
@@ -1033,8 +1211,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb4444 == width * 2) {
+  if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb4444 = 0;
@@ -1063,6 +1240,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOARGB4444ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToARGB4444Row = ARGBToARGB4444Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToARGB4444Row(src_argb, dst_argb4444, width);
@@ -1072,21 +1257,123 @@
   return 0;
 }
 
+// Convert ABGR To AR30.
+LIBYUV_API
+int ABGRToAR30(const uint8_t* src_abgr,
+               int src_stride_abgr,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  int y;
+  void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) =
+      ABGRToAR30Row_C;
+  if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+    src_stride_abgr = -src_stride_abgr;
+  }
+  // Coalesce rows.
+  if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_abgr = dst_stride_ar30 = 0;
+  }
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ABGRToAR30Row = ABGRToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ABGRToAR30Row = ABGRToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ABGRToAR30Row = ABGRToAR30Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ABGRToAR30Row(src_abgr, dst_ar30, width);
+    src_abgr += src_stride_abgr;
+    dst_ar30 += dst_stride_ar30;
+  }
+  return 0;
+}
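
ABGRToAR30 above also shows the library's negative-height convention: a caller passes a negative height to request a vertical flip, which the converter implements by pointing at the last source row and negating the stride, so the row kernels never need to know. A sketch of just that step:

    #include <stdint.h>

    // Negative height means invert the image: start at the last row and walk
    // backwards by negating the stride.
    static void FlipSetup(const uint8_t** src, int* src_stride, int* height) {
      if (*height < 0) {
        *height = -*height;
        *src = *src + (*height - 1) * *src_stride;
        *src_stride = -*src_stride;
      }
    }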
+
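
AR30 itself is a 2:10:10:10 packed format. A hedged sketch of the per-pixel packing the C row functions perform, assuming the layout with B in the low 10 bits and the 2-bit alpha on top; an 8-bit channel is widened to 10 bits by replicating its two high bits so that 0xFF maps to 0x3FF:

    #include <stdint.h>
    #include <string.h>

    static inline uint32_t Widen10(uint8_t v) {
      return ((uint32_t)v << 2) | (v >> 6);  // 0xFF -> 0x3FF, 0x00 -> 0x000
    }

    // Pack little-endian ARGB bytes (B, G, R, A in memory) into AR30 words.
    static void PackAR30(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
      for (int x = 0; x < width; ++x) {
        uint32_t pixel = Widen10(src_argb[0]) |                 // B: bits 0-9
                         (Widen10(src_argb[1]) << 10) |         // G: bits 10-19
                         (Widen10(src_argb[2]) << 20) |         // R: bits 20-29
                         ((uint32_t)(src_argb[3] >> 6) << 30);  // A: bits 30-31
        memcpy(dst_ar30, &pixel, 4);  // byte-wise store avoids alignment UB
        src_argb += 4;
        dst_ar30 += 4;
      }
    }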
+// Convert ARGB To AR30.
+LIBYUV_API
+int ARGBToAR30(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_ar30,
+               int dst_stride_ar30,
+               int width,
+               int height) {
+  int y;
+  void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) =
+      ARGBToAR30Row_C;
+  if (!src_argb || !dst_ar30 || width <= 0 || height == 0) {
+    return -1;
+  }
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  // Coalesce rows.
+  if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) {
+    width *= height;
+    height = 1;
+    src_stride_argb = dst_stride_ar30 = 0;
+  }
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBToAR30Row = ARGBToAR30Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToAR30Row = ARGBToAR30Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    ARGBToAR30Row(src_argb, dst_ar30, width);
+    src_argb += src_stride_argb;
+    dst_ar30 += dst_stride_ar30;
+  }
+  return 0;
+}
+
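
Both AR30 converters use the dispatch idiom that recurs throughout this patch: start from the portable C row function, upgrade to the `_Any_` SIMD variant when the CPU flag is present, and to the full-width SIMD variant only when the width meets the kernel's alignment requirement. A compilable sketch of the shape; Row_C/Row_SIMD/HasSIMD are stand-ins, not libyuv symbols, and the tail is handled here with a C fallback (the real `_Any_` macros instead run the SIMD kernel on a padded temporary):

    #include <stdint.h>

    typedef void (*RowFunc)(const uint8_t* src, uint8_t* dst, int width);

    static void Row_C(const uint8_t* src, uint8_t* dst, int width) {
      for (int x = 0; x < width; ++x) dst[x] = src[x];  // placeholder work
    }
    // Pretend SIMD kernel: requires width to be a multiple of 8.
    static void Row_SIMD(const uint8_t* src, uint8_t* dst, int width) {
      Row_C(src, dst, width);  // stand-in for vectorized code
    }
    // "_Any_" wrapper: SIMD for the multiple-of-8 bulk, C for the remainder.
    static void Row_Any(const uint8_t* src, uint8_t* dst, int width) {
      int n = width & ~7;
      if (n > 0) Row_SIMD(src, dst, n);
      Row_C(src + n, dst + n, width - n);
    }
    static int HasSIMD(void) { return 1; }  // stand-in for TestCpuFlag(...)

    static RowFunc PickRow(int width) {
      RowFunc f = Row_C;
      if (HasSIMD()) {
        f = Row_Any;
        if ((width & 7) == 0) {  // IS_ALIGNED(width, 8)
          f = Row_SIMD;
        }
      }
      return f;
    }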
 // Convert ARGB to J420. (JPeg full range I420).
 LIBYUV_API
-int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToJ420(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+  void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
       ARGBToYJRow_C;
-  if (!src_argb ||
-      !dst_yj || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1129,6 +1416,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVJRow = ARGBToUVJRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
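
The J420 loop that begins here advances two rows per iteration: 4:2:0 chroma is subsampled vertically, so the UV row function reads a pair of rows (via its stride argument) while Y is emitted once per row; an odd trailing row is handled after the loop by passing a stride of 0, averaging the row with itself. A sketch of the loop shape, with ToYRow/ToUVRow as illustrative stand-ins:

    #include <stdint.h>

    typedef void (*YRow)(const uint8_t*, uint8_t*, int);
    typedef void (*UVRow)(const uint8_t*, int, uint8_t*, uint8_t*, int);

    static void To420(YRow to_y, UVRow to_uv,
                      const uint8_t* src, int src_stride,
                      uint8_t* dst_y, int dst_stride_y,
                      uint8_t* dst_u, int dst_stride_u,
                      uint8_t* dst_v, int dst_stride_v,
                      int width, int height) {
      int y;
      for (y = 0; y < height - 1; y += 2) {
        to_uv(src, src_stride, dst_u, dst_v, width);  // reads two rows
        to_y(src, dst_y, width);
        to_y(src + src_stride, dst_y + dst_stride_y, width);
        src += src_stride * 2;
        dst_y += dst_stride_y * 2;
        dst_u += dst_stride_u;
        dst_v += dst_stride_v;
      }
      if (height & 1) {
        to_uv(src, 0, dst_u, dst_v, width);  // zero stride: row paired with itself
        to_y(src, dst_y, width);
      }
    }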
@@ -1148,19 +1451,23 @@
 
 // Convert ARGB to J422. (JPeg full range I422).
 LIBYUV_API
-int ARGBToJ422(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int ARGBToJ422(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+  void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
+                       uint8_t* dst_u, uint8_t* dst_v, int width) =
+      ARGBToUVJRow_C;
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
       ARGBToYJRow_C;
-  if (!src_argb ||
-      !dst_yj || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1170,10 +1477,8 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yj == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_argb == width * 4 && dst_stride_yj == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0;
@@ -1212,6 +1517,22 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOUVJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToUVJRow = ARGBToUVJRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVJRow = ARGBToUVJRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
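
By contrast, the J422 loop here always passes a source stride of 0 to ARGBToUVJRow: the two-row vertical average degenerates to reading the same row twice, leaving horizontal-only subsampling, which is exactly 4:2:2. In sketch form:

    #include <stdint.h>

    // With stride 0 the vertical average is the identity:
    //   (row[x] + row[x + stride] + 1) >> 1  ==  row[x]  when stride == 0.
    static inline uint8_t AvgTwoRows(const uint8_t* row, int stride, int x) {
      return (uint8_t)((row[x] + row[x + stride] + 1) >> 1);
    }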
@@ -1226,11 +1547,14 @@
 
 // Convert ARGB to J400.
 LIBYUV_API
-int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_yj, int dst_stride_yj,
-               int width, int height) {
+int ARGBToJ400(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_yj,
+               int dst_stride_yj,
+               int width,
+               int height) {
   int y;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) =
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) =
       ARGBToYJRow_C;
   if (!src_argb || !dst_yj || width <= 0 || height == 0) {
     return -1;
@@ -1241,8 +1565,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_yj == width) {
+  if (src_stride_argb == width * 4 && dst_stride_yj == width) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_yj = 0;
@@ -1268,6 +1591,14 @@
     ARGBToYJRow = ARGBToYJRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYJRow = ARGBToYJRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
     }
   }
 #endif
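
The `J` in J420/J422/J400 denotes JPEG (full-range) YUV: ARGBToYJRow computes luma with coefficients that sum to full scale and no +16 offset, so 0..255 RGB maps onto 0..255 Y. A hedged sketch of the fixed-point form, with BT.601 weights assumed (38 + 75 + 15 = 128 in 7-bit fixed point):

    #include <stdint.h>

    static inline uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((38 * r + 75 * g + 15 * b + 64) >> 7);
    }
    // RGBToYJ(255, 255, 255) == 255 and RGBToYJ(0, 0, 0) == 0, whereas a
    // studio-range Y would land at 235 and 16 respectively.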
--- a/third_party/libyuv/source/convert_jpeg.cc
+++ b/third_party/libyuv/source/convert_jpeg.cc
@@ -22,11 +22,11 @@
 
 #ifdef HAVE_JPEG
 struct I420Buffers {
-  uint8* y;
+  uint8_t* y;
   int y_stride;
-  uint8* u;
+  uint8_t* u;
   int u_stride;
-  uint8* v;
+  uint8_t* v;
   int v_stride;
   int w;
   int h;
@@ -33,17 +33,13 @@
 };
 
 static void JpegCopyI420(void* opaque,
-                         const uint8* const* data,
+                         const uint8_t* const* data,
                          const int* strides,
                          int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I420Copy(data[0], strides[0],
-           data[1], strides[1],
-           data[2], strides[2],
-           dest->y, dest->y_stride,
-           dest->u, dest->u_stride,
-           dest->v, dest->v_stride,
-           dest->w, rows);
+  I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+           dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+           dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -51,17 +47,13 @@
 }
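
JpegCopyI420 and its siblings below all follow the same cursor pattern: the MJPEG decoder invokes the callback with a band of `rows` rows, the callback copies the band into the destination planes, then advances the cursor, halving the row count for chroma because I420 carries half as many chroma rows as luma rows. A sketch of the idiom (Buffers/OnRows are illustrative names, not the libyuv API):

    #include <stdint.h>

    typedef struct {
      uint8_t* y; int y_stride;
      uint8_t* u; int u_stride;
      uint8_t* v; int v_stride;
      int w, h;
    } Buffers;

    static void OnRows(void* opaque, const uint8_t* const* data,
                       const int* strides, int rows) {
      Buffers* dest = (Buffers*)opaque;
      (void)data; (void)strides;  // a real callback copies the planes here
      dest->y += rows * dest->y_stride;
      dest->u += ((rows + 1) >> 1) * dest->u_stride;  // chroma: half the rows
      dest->v += ((rows + 1) >> 1) * dest->v_stride;
      dest->h -= rows;  // destination rows remaining
    }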
 
 static void JpegI422ToI420(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I422ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+             dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -69,17 +61,13 @@
 }
 
 static void JpegI444ToI420(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I444ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v,
+             dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -86,34 +74,13 @@
   dest->h -= rows;
 }
 
-static void JpegI411ToI420(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  I420Buffers* dest = (I420Buffers*)(opaque);
-  I411ToI420(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
-  dest->y += rows * dest->y_stride;
-  dest->u += ((rows + 1) >> 1) * dest->u_stride;
-  dest->v += ((rows + 1) >> 1) * dest->v_stride;
-  dest->h -= rows;
-}
-
 static void JpegI400ToI420(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   I420Buffers* dest = (I420Buffers*)(opaque);
-  I400ToI420(data[0], strides[0],
-             dest->y, dest->y_stride,
-             dest->u, dest->u_stride,
-             dest->v, dest->v_stride,
-             dest->w, rows);
+  I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u,
+             dest->u_stride, dest->v, dest->v_stride, dest->w, rows);
   dest->y += rows * dest->y_stride;
   dest->u += ((rows + 1) >> 1) * dest->u_stride;
   dest->v += ((rows + 1) >> 1) * dest->v_stride;
@@ -122,8 +89,10 @@
 
 // Query size of MJPG in pixels.
 LIBYUV_API
-int MJPGSize(const uint8* sample, size_t sample_size,
-             int* width, int* height) {
+int MJPGSize(const uint8_t* sample,
+             size_t sample_size,
+             int* width,
+             int* height) {
   MJpegDecoder mjpeg_decoder;
   LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
   if (ret) {
@@ -135,15 +104,21 @@
 }
 
 // MJPG (Motion JPeg) to I420
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+// TODO(fbarchard): review src_width and src_height requirement. dst_width and
+// dst_height may be enough.
 LIBYUV_API
-int MJPGToI420(const uint8* sample,
+int MJPGToI420(const uint8_t* sample,
                size_t sample_size,
-               uint8* y, int y_stride,
-               uint8* u, int u_stride,
-               uint8* v, int v_stride,
-               int w, int h,
-               int dw, int dh) {
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height) {
   if (sample_size == kUnknownDataSize) {
     // ERROR: MJPEG frame size unknown
     return -1;
@@ -152,17 +127,17 @@
   // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
+  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+              mjpeg_decoder.GetHeight() != src_height)) {
     // ERROR: MJPEG frame has unexpected dimensions
     mjpeg_decoder.UnloadFrame();
     return 1;  // runtime failure
   }
   if (ret) {
-    I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+    I420Buffers bufs = {dst_y, dst_stride_y, dst_u,     dst_stride_u,
+                        dst_v, dst_stride_v, dst_width, dst_height};
     // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
+    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
         mjpeg_decoder.GetNumComponents() == 3 &&
         mjpeg_decoder.GetVertSampFactor(0) == 2 &&
         mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@@ -170,8 +145,9 @@
         mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
         mjpeg_decoder.GetVertSampFactor(2) == 1 &&
         mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
-    // YUV422
+      ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width,
+                                           dst_height);
+      // YUV422
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -181,8 +157,9 @@
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
-    // YUV444
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width,
+                                           dst_height);
+      // YUV444
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -192,28 +169,19 @@
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
-    // YUV411
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width,
+                                           dst_height);
+      // YUV400
     } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
-    // YUV400
-    } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceGrayscale &&
                mjpeg_decoder.GetNumComponents() == 1 &&
                mjpeg_decoder.GetVertSampFactor(0) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(0) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width,
+                                           dst_height);
     } else {
       // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
+      // factors that occur in practice.
       // ERROR: Unable to convert MJPEG frame because format is not supported
       mjpeg_decoder.UnloadFrame();
       return 1;
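
The chain of GetNumComponents/Get*SampFactor tests above is a subsampling classifier: the sampling factors of component 0 (Y), with chroma pinned at 1x1, identify the JPEG layout. The removed 4:1:1 branch (horizontal factor 4) now falls through to the unsupported case. The same logic condensed into one illustrative function:

    typedef enum {
      kJpeg420, kJpeg422, kJpeg444, kJpeg400, kJpegUnsupported
    } JpegLayout;

    // h0/v0 are the horizontal/vertical sampling factors of component 0;
    // every branch above additionally requires 1x1 chroma components.
    static JpegLayout Classify(int num_components, int h0, int v0) {
      if (num_components == 3 && h0 == 2 && v0 == 2) return kJpeg420;
      if (num_components == 3 && h0 == 2 && v0 == 1) return kJpeg422;
      if (num_components == 3 && h0 == 1 && v0 == 1) return kJpeg444;
      if (num_components == 1 && h0 == 1 && v0 == 1) return kJpeg400;
      return kJpegUnsupported;  // e.g. 4:1:1, whose branch this patch removes
    }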
@@ -224,7 +192,7 @@
 
 #ifdef HAVE_JPEG
 struct ARGBBuffers {
-  uint8* argb;
+  uint8_t* argb;
   int argb_stride;
   int w;
   int h;
@@ -231,81 +199,60 @@
 };
 
 static void JpegI420ToARGB(void* opaque,
-                         const uint8* const* data,
-                         const int* strides,
-                         int rows) {
+                           const uint8_t* const* data,
+                           const int* strides,
+                           int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I420ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
 static void JpegI422ToARGB(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I422ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
 static void JpegI444ToARGB(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I444ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2],
+             dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
-static void JpegI411ToARGB(void* opaque,
-                           const uint8* const* data,
-                           const int* strides,
-                           int rows) {
-  ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I411ToARGB(data[0], strides[0],
-             data[1], strides[1],
-             data[2], strides[2],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
-  dest->argb += rows * dest->argb_stride;
-  dest->h -= rows;
-}
-
 static void JpegI400ToARGB(void* opaque,
-                           const uint8* const* data,
+                           const uint8_t* const* data,
                            const int* strides,
                            int rows) {
   ARGBBuffers* dest = (ARGBBuffers*)(opaque);
-  I400ToARGB(data[0], strides[0],
-             dest->argb, dest->argb_stride,
-             dest->w, rows);
+  I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows);
   dest->argb += rows * dest->argb_stride;
   dest->h -= rows;
 }
 
 // MJPG (Motion JPeg) to ARGB
-// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+// TODO(fbarchard): review src_width and src_height requirement. dst_width and
+// dst_height may be enough.
 LIBYUV_API
-int MJPGToARGB(const uint8* sample,
+int MJPGToARGB(const uint8_t* sample,
                size_t sample_size,
-               uint8* argb, int argb_stride,
-               int w, int h,
-               int dw, int dh) {
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int src_width,
+               int src_height,
+               int dst_width,
+               int dst_height) {
   if (sample_size == kUnknownDataSize) {
     // ERROR: MJPEG frame size unknown
     return -1;
@@ -314,17 +261,16 @@
   // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
-  if (ret && (mjpeg_decoder.GetWidth() != w ||
-              mjpeg_decoder.GetHeight() != h)) {
+  if (ret && (mjpeg_decoder.GetWidth() != src_width ||
+              mjpeg_decoder.GetHeight() != src_height)) {
     // ERROR: MJPEG frame has unexpected dimensions
     mjpeg_decoder.UnloadFrame();
     return 1;  // runtime failure
   }
   if (ret) {
-    ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+    ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height};
     // YUV420
-    if (mjpeg_decoder.GetColorSpace() ==
-            MJpegDecoder::kColorSpaceYCbCr &&
+    if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr &&
         mjpeg_decoder.GetNumComponents() == 3 &&
         mjpeg_decoder.GetVertSampFactor(0) == 2 &&
         mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
@@ -332,8 +278,9 @@
         mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
         mjpeg_decoder.GetVertSampFactor(2) == 1 &&
         mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
-    // YUV422
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width,
+                                           dst_height);
+      // YUV422
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -343,8 +290,9 @@
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
-    // YUV444
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width,
+                                           dst_height);
+      // YUV444
     } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceYCbCr &&
                mjpeg_decoder.GetNumComponents() == 3 &&
@@ -354,28 +302,19 @@
                mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
                mjpeg_decoder.GetVertSampFactor(2) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
-    // YUV411
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width,
+                                           dst_height);
+      // YUV400
     } else if (mjpeg_decoder.GetColorSpace() ==
-                   MJpegDecoder::kColorSpaceYCbCr &&
-               mjpeg_decoder.GetNumComponents() == 3 &&
-               mjpeg_decoder.GetVertSampFactor(0) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
-               mjpeg_decoder.GetVertSampFactor(1) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
-               mjpeg_decoder.GetVertSampFactor(2) == 1 &&
-               mjpeg_decoder.GetHorizSampFactor(2) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
-    // YUV400
-    } else if (mjpeg_decoder.GetColorSpace() ==
                    MJpegDecoder::kColorSpaceGrayscale &&
                mjpeg_decoder.GetNumComponents() == 1 &&
                mjpeg_decoder.GetVertSampFactor(0) == 1 &&
                mjpeg_decoder.GetHorizSampFactor(0) == 1) {
-      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+      ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width,
+                                           dst_height);
     } else {
       // TODO(fbarchard): Implement conversion for any other colorspace/sample
-      // factors that occur in practice. 411 is supported by libjpeg
+      // factors that occur in practice.
       // ERROR: Unable to convert MJPEG frame because format is not supported
       mjpeg_decoder.UnloadFrame();
       return 1;
--- a/third_party/libyuv/source/convert_to_argb.cc
+++ b/third_party/libyuv/source/convert_to_argb.cc
@@ -28,36 +28,50 @@
 // src_height is used to compute the location of the planes, and indicates inversion
 // sample_size is measured in bytes and is the size of the frame.
 //   With MJPEG it is the compressed size of the frame.
+
+// TODO(fbarchard): Add the following:
+// H010ToARGB
+// H420ToARGB
+// H422ToARGB
+// I010ToARGB
+// J400ToARGB
+// J422ToARGB
+// J444ToARGB
+
 LIBYUV_API
-int ConvertToARGB(const uint8* sample, size_t sample_size,
-                  uint8* crop_argb, int argb_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+int ConvertToARGB(const uint8_t* sample,
+                  size_t sample_size,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
+                  uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
   int aligned_src_width = (src_width + 1) & ~1;
-  const uint8* src;
-  const uint8* src_uv;
+  const uint8_t* src;
+  const uint8_t* src_uv;
   int abs_src_height = (src_height < 0) ? -src_height : src_height;
   int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
   int r = 0;
 
   // One pass rotation is available for some formats. For the rest, convert
-  // to I420 (with optional vertical flipping) into a temporary I420 buffer,
-  // and then rotate the I420 to the final destination buffer.
-  // For in-place conversion, if destination crop_argb is same as source sample,
+  // to ARGB (with optional vertical flipping) into a temporary ARGB buffer,
+  // and then rotate the ARGB to the final destination buffer.
+  // For in-place conversion, if destination dst_argb is same as source sample,
   // also enable temporary buffer.
-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
-      crop_argb == sample;
-  uint8* dest_argb = crop_argb;
-  int dest_argb_stride = argb_stride;
-  uint8* rotate_buffer = NULL;
+  LIBYUV_BOOL need_buf =
+      (rotation && format != FOURCC_ARGB) || dst_argb == sample;
+  uint8_t* dest_argb = dst_argb;
+  int dest_dst_stride_argb = dst_stride_argb;
+  uint8_t* rotate_buffer = NULL;
   int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
 
-  if (crop_argb == NULL || sample == NULL ||
-      src_width <= 0 || crop_width <= 0 ||
+  if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 ||
       src_height == 0 || crop_height == 0) {
     return -1;
   }
@@ -67,12 +81,12 @@
 
   if (need_buf) {
     int argb_size = crop_width * 4 * abs_crop_height;
-    rotate_buffer = (uint8*)malloc(argb_size);
+    rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */
     if (!rotate_buffer) {
       return 1;  // Out of memory runtime error.
     }
-    crop_argb = rotate_buffer;
-    argb_stride = crop_width * 4;
+    dst_argb = rotate_buffer;
+    dst_stride_argb = crop_width * 4;
   }
 
   switch (format) {
@@ -79,175 +93,162 @@
     // Single plane formats
     case FOURCC_YUY2:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToARGB(src, aligned_src_width * 2,
-                     crop_argb, argb_stride,
+      r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_UYVY:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToARGB(src, aligned_src_width * 2,
-                     crop_argb, argb_stride,
+      r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb,
                      crop_width, inv_crop_height);
       break;
     case FOURCC_24BG:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToARGB(src, src_width * 3,
-                      crop_argb, argb_stride,
-                      crop_width, inv_crop_height);
+      r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
+                      inv_crop_height);
       break;
     case FOURCC_RAW:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToARGB(src, src_width * 3,
-                    crop_argb, argb_stride,
-                    crop_width, inv_crop_height);
+      r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width,
+                    inv_crop_height);
       break;
     case FOURCC_ARGB:
-      src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ARGBToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      if (!need_buf && !rotation) {
+        src = sample + (src_width * crop_y + crop_x) * 4;
+        r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb,
+                       crop_width, inv_crop_height);
+      }
       break;
     case FOURCC_BGRA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_ABGR:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToARGB(src, src_width * 4,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
+    case FOURCC_AR30:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
+    case FOURCC_AB30:
+      src = sample + (src_width * crop_y + crop_x) * 4;
+      r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
+      break;
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToARGB(src, src_width * 2,
-                       crop_argb, argb_stride,
+      r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                        crop_width, inv_crop_height);
       break;
     case FOURCC_RGBO:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToARGB(src, src_width * 2,
-                         crop_argb, argb_stride,
+      r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                          crop_width, inv_crop_height);
       break;
     case FOURCC_R444:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToARGB(src, src_width * 2,
-                         crop_argb, argb_stride,
+      r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb,
                          crop_width, inv_crop_height);
       break;
     case FOURCC_I400:
       src = sample + src_width * crop_y + crop_x;
-      r = I400ToARGB(src, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
 
     // Biplanar formats
     case FOURCC_NV12:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
-      r = NV12ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      src_uv =
+          sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
+      r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
+                     dst_stride_argb, crop_width, inv_crop_height);
       break;
     case FOURCC_NV21:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+      src_uv =
+          sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x;
       // Call NV12 but with u and v parameters swapped.
-      r = NV21ToARGB(src, src_width,
-                     src_uv, aligned_src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb,
+                     dst_stride_argb, crop_width, inv_crop_height);
       break;
     case FOURCC_M420:
       src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToARGB(src, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width,
+                     inv_crop_height);
       break;
+
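
The M420 offset a few lines up uses a 12/8 factor because M420 stores each pair of pixel rows as two Y rows plus one interleaved UV row, i.e. 3 * src_width bytes per 2 rows, or 1.5 bytes per pixel row. A tiny worked check with assumed example values:

    #include <assert.h>

    int main(void) {
      int src_width = 640, crop_y = 4;  // example values
      // 4 pixel rows = 4 Y rows + 2 UV rows = 6 rows of src_width bytes.
      assert((src_width * crop_y) * 12 / 8 == 6 * 640);
      return 0;
    }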
     // Triplanar formats
     case FOURCC_I420:
     case FOURCC_YV12: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
       if (format == FOURCC_YV12) {
         src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       } else {
         src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       }
-      r = I420ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
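
The I420/YV12 arithmetic here places U and V after the Y plane (src_width * abs_src_height bytes) and applies the crop at half resolution; YV12 is identical except the V plane comes first. A worked check for 640x480 with no crop:

    #include <assert.h>

    int main(void) {
      int src_width = 640, abs_src_height = 480, crop_x = 0, crop_y = 0;
      int halfwidth = (src_width + 1) / 2;        // 320
      int halfheight = (abs_src_height + 1) / 2;  // 240
      // YV12: V directly after Y, U after V.
      long v_off = (long)src_width * abs_src_height +
                   (halfwidth * crop_y + crop_x) / 2;
      long u_off = (long)src_width * abs_src_height +
                   halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
      assert(v_off == 307200);          // 640 * 480
      assert(u_off == 307200 + 76800);  // + 320 * 240
      return 0;
    }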
 
     case FOURCC_J420: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
       src_u = sample + src_width * abs_src_height +
-          (halfwidth * crop_y + crop_x) / 2;
+              (halfwidth * crop_y + crop_x) / 2;
       src_v = sample + src_width * abs_src_height +
-          halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
-      r = J420ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+              halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+      r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
 
     case FOURCC_I422:
     case FOURCC_YV16: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       }
-      r = I422ToARGB(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
     case FOURCC_I444:
     case FOURCC_YV24: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       if (format == FOURCC_YV24) {
         src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
@@ -255,32 +256,14 @@
         src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       }
-      r = I444ToARGB(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
+      r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_argb, dst_stride_argb, crop_width, inv_crop_height);
       break;
     }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToARGB(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     crop_argb, argb_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
 #ifdef HAVE_JPEG
     case FOURCC_MJPG:
-      r = MJPGToARGB(sample, sample_size,
-                     crop_argb, argb_stride,
-                     src_width, abs_src_height, crop_width, inv_crop_height);
+      r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width,
+                     abs_src_height, crop_width, inv_crop_height);
       break;
 #endif
     default:
@@ -289,11 +272,14 @@
 
   if (need_buf) {
     if (!r) {
-      r = ARGBRotate(crop_argb, argb_stride,
-                     dest_argb, dest_argb_stride,
+      r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb,
                      crop_width, abs_crop_height, rotation);
     }
     free(rotate_buffer);
+  } else if (rotation) {
+    src = sample + (src_width * crop_y + crop_x) * 4;
+    r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width,
+                   inv_crop_height, rotation);
   }
 
   return r;
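
The tail of ConvertToARGB shows the rotation strategy: formats without a one-pass rotating converter are first converted into a malloc'd ARGB scratch buffer and then ARGBRotate'd into the caller's buffer, while the new `else if (rotation)` branch lets a pure-ARGB source rotate directly. A compilable sketch of that control flow, with ConvertRows/RotateARGB as stand-ins and the same error convention (-1 bad argument elsewhere, 1 runtime failure, 0 success):

    #include <stdint.h>
    #include <stdlib.h>

    static int ConvertRows(const uint8_t* s, uint8_t* d, int w, int h) {
      (void)s; (void)d; (void)w; (void)h; return 0;  // stand-in
    }
    static int RotateARGB(const uint8_t* s, uint8_t* d, int w, int h) {
      (void)s; (void)d; (void)w; (void)h; return 0;  // stand-in
    }

    static int ConvertWithRotate(const uint8_t* sample, uint8_t* dst,
                                 int w, int h, int rotation, int is_argb) {
      int need_buf = (rotation && !is_argb) || (dst == sample);  // in-place too
      int r = 0;
      if (need_buf) {
        uint8_t* work = (uint8_t*)malloc((size_t)w * 4 * h);
        if (!work) return 1;  // out of memory
        r = ConvertRows(sample, work, w, h);
        if (!r) r = RotateARGB(work, dst, w, h);
        free(work);
      } else if (rotation) {
        r = RotateARGB(sample, dst, w, h);  // ARGB source: rotate in one pass
      } else {
        r = ConvertRows(sample, dst, w, h);
      }
      return r;
    }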
--- a/third_party/libyuv/source/convert_to_i420.cc
+++ b/third_party/libyuv/source/convert_to_i420.cc
@@ -25,40 +25,46 @@
 // sample_size is measured in bytes and is the size of the frame.
 //   With MJPEG it is the compressed size of the frame.
 LIBYUV_API
-int ConvertToI420(const uint8* sample,
+int ConvertToI420(const uint8_t* sample,
                   size_t sample_size,
-                  uint8* y, int y_stride,
-                  uint8* u, int u_stride,
-                  uint8* v, int v_stride,
-                  int crop_x, int crop_y,
-                  int src_width, int src_height,
-                  int crop_width, int crop_height,
+                  uint8_t* dst_y,
+                  int dst_stride_y,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int crop_x,
+                  int crop_y,
+                  int src_width,
+                  int src_height,
+                  int crop_width,
+                  int crop_height,
                   enum RotationMode rotation,
-                  uint32 fourcc) {
-  uint32 format = CanonicalFourCC(fourcc);
+                  uint32_t fourcc) {
+  uint32_t format = CanonicalFourCC(fourcc);
   int aligned_src_width = (src_width + 1) & ~1;
-  const uint8* src;
-  const uint8* src_uv;
+  const uint8_t* src;
+  const uint8_t* src_uv;
   const int abs_src_height = (src_height < 0) ? -src_height : src_height;
   // TODO(nisse): Why allow crop_height < 0?
   const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
   int r = 0;
-  LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
-      format != FOURCC_NV12 && format != FOURCC_NV21 &&
-      format != FOURCC_YV12) || y == sample;
-  uint8* tmp_y = y;
-  uint8* tmp_u = u;
-  uint8* tmp_v = v;
-  int tmp_y_stride = y_stride;
-  int tmp_u_stride = u_stride;
-  int tmp_v_stride = v_stride;
-  uint8* rotate_buffer = NULL;
+  LIBYUV_BOOL need_buf =
+      (rotation && format != FOURCC_I420 && format != FOURCC_NV12 &&
+       format != FOURCC_NV21 && format != FOURCC_YV12) ||
+      dst_y == sample;
+  uint8_t* tmp_y = dst_y;
+  uint8_t* tmp_u = dst_u;
+  uint8_t* tmp_v = dst_v;
+  int tmp_y_stride = dst_stride_y;
+  int tmp_u_stride = dst_stride_u;
+  int tmp_v_stride = dst_stride_v;
+  uint8_t* rotate_buffer = NULL;
   const int inv_crop_height =
       (src_height < 0) ? -abs_crop_height : abs_crop_height;
 
-  if (!y || !u || !v || !sample ||
-      src_width <= 0 || crop_width <= 0  ||
-      src_height == 0 || crop_height == 0) {
+  if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
+      crop_width <= 0 || src_height == 0 || crop_height == 0) {
     return -1;
   }
 
@@ -65,20 +71,20 @@
   // One pass rotation is available for some formats. For the rest, convert
   // to I420 (with optional vertical flipping) into a temporary I420 buffer,
   // and then rotate the I420 to the final destination buffer.
-  // For in-place conversion, if destination y is same as source sample,
+  // For in-place conversion, if destination dst_y is same as source sample,
   // also enable temporary buffer.
   if (need_buf) {
     int y_size = crop_width * abs_crop_height;
     int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
-    rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+    rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */
     if (!rotate_buffer) {
       return 1;  // Out of memory runtime error.
     }
-    y = rotate_buffer;
-    u = y + y_size;
-    v = u + uv_size;
-    y_stride = crop_width;
-    u_stride = v_stride = ((crop_width + 1) / 2);
+    dst_y = rotate_buffer;
+    dst_u = dst_y + y_size;
+    dst_v = dst_u + uv_size;
+    dst_stride_y = crop_width;
+    dst_stride_u = dst_stride_v = ((crop_width + 1) / 2);
   }
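
ConvertToI420 only needs the scratch buffer when the source format lacks a one-pass rotating converter (I420/YV12 go through I420Rotate, NV12/NV21 through NV12ToI420Rotate) or when the call is in place. The scratch itself is a single allocation laid out Y then U then V, exactly as in the block above; in sketch form:

    #include <stdint.h>
    #include <stdlib.h>

    typedef struct {
      uint8_t *y, *u, *v;
      int y_stride, uv_stride;
    } ScratchI420;

    // One allocation holding Y, U, V contiguously; returns NULL on failure.
    static uint8_t* AllocScratchI420(ScratchI420* s, int w, int h) {
      int y_size = w * h;
      int uv_size = ((w + 1) / 2) * ((h + 1) / 2);
      uint8_t* base = (uint8_t*)malloc((size_t)y_size + 2 * (size_t)uv_size);
      if (!base) return NULL;
      s->y = base;
      s->u = s->y + y_size;
      s->v = s->u + uv_size;
      s->y_stride = w;
      s->uv_stride = (w + 1) / 2;
      return base;  // caller frees
    }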
 
   switch (format) {
@@ -85,191 +91,150 @@
     // Single plane formats
     case FOURCC_YUY2:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_UYVY:
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToI420(src, aligned_src_width * 2,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = RGB565ToI420(src, src_width * 2,
-                       y, y_stride,
-                       u, u_stride,
-                       v, v_stride,
-                       crop_width, inv_crop_height);
+      r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                       dst_stride_u, dst_v, dst_stride_v, crop_width,
+                       inv_crop_height);
       break;
     case FOURCC_RGBO:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB1555ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         crop_width, inv_crop_height);
+      r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                         dst_stride_u, dst_v, dst_stride_v, crop_width,
+                         inv_crop_height);
       break;
     case FOURCC_R444:
       src = sample + (src_width * crop_y + crop_x) * 2;
-      r = ARGB4444ToI420(src, src_width * 2,
-                         y, y_stride,
-                         u, u_stride,
-                         v, v_stride,
-                         crop_width, inv_crop_height);
+      r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
+                         dst_stride_u, dst_v, dst_stride_v, crop_width,
+                         inv_crop_height);
       break;
     case FOURCC_24BG:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RGB24ToI420(src, src_width * 3,
-                      y, y_stride,
-                      u, u_stride,
-                      v, v_stride,
-                      crop_width, inv_crop_height);
+      r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
+                      dst_stride_u, dst_v, dst_stride_v, crop_width,
+                      inv_crop_height);
       break;
     case FOURCC_RAW:
       src = sample + (src_width * crop_y + crop_x) * 3;
-      r = RAWToI420(src, src_width * 3,
-                    y, y_stride,
-                    u, u_stride,
-                    v, v_stride,
-                    crop_width, inv_crop_height);
+      r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u,
+                    dst_stride_u, dst_v, dst_stride_v, crop_width,
+                    inv_crop_height);
       break;
     case FOURCC_ARGB:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ARGBToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_BGRA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = BGRAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_ABGR:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = ABGRToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
     case FOURCC_RGBA:
       src = sample + (src_width * crop_y + crop_x) * 4;
-      r = RGBAToI420(src, src_width * 4,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, crop_width,
+                     inv_crop_height);
       break;
+    // TODO(fbarchard): Add AR30 and AB30
     case FOURCC_I400:
       src = sample + src_width * crop_y + crop_x;
-      r = I400ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                     dst_v, dst_stride_v, crop_width, inv_crop_height);
       break;
     // Biplanar formats
     case FOURCC_NV12:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + (src_width * src_height) +
-        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           u, u_stride,
-                           v, v_stride,
-                           crop_width, inv_crop_height, rotation);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
+                           dst_stride_y, dst_u, dst_stride_u, dst_v,
+                           dst_stride_v, crop_width, inv_crop_height, rotation);
       break;
     case FOURCC_NV21:
       src = sample + (src_width * crop_y + crop_x);
-      src_uv = sample + (src_width * src_height) +
-        ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
-      // Call NV12 but with u and v parameters swapped.
-      r = NV12ToI420Rotate(src, src_width,
-                           src_uv, aligned_src_width,
-                           y, y_stride,
-                           v, v_stride,
-                           u, u_stride,
-                           crop_width, inv_crop_height, rotation);
+      src_uv = sample + (src_width * abs_src_height) +
+               ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2);
+      // Call NV12 but with dst_u and dst_v parameters swapped.
+      r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y,
+                           dst_stride_y, dst_v, dst_stride_v, dst_u,
+                           dst_stride_u, crop_width, inv_crop_height, rotation);
       break;
     case FOURCC_M420:
       src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
-      r = M420ToI420(src, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                     dst_v, dst_stride_v, crop_width, inv_crop_height);
       break;
     // Triplanar formats
     case FOURCC_I420:
     case FOURCC_YV12: {
-      const uint8* src_y = sample + (src_width * crop_y + crop_x);
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + (src_width * crop_y + crop_x);
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       int halfheight = (abs_src_height + 1) / 2;
       if (format == FOURCC_YV12) {
         src_v = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       } else {
         src_u = sample + src_width * abs_src_height +
-            (halfwidth * crop_y + crop_x) / 2;
+                (halfwidth * crop_y + crop_x) / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+                halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
       }
-      r = I420Rotate(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height, rotation);
+      r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height, rotation);
       break;
     }
     case FOURCC_I422:
     case FOURCC_YV16: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       int halfwidth = (src_width + 1) / 2;
       if (format == FOURCC_YV16) {
-        src_v = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_v = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_u = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       } else {
-        src_u = sample + src_width * abs_src_height +
-            halfwidth * crop_y + crop_x / 2;
+        src_u = sample + src_width * abs_src_height + halfwidth * crop_y +
+                crop_x / 2;
         src_v = sample + src_width * abs_src_height +
-            halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+                halfwidth * (abs_src_height + crop_y) + crop_x / 2;
       }
-      r = I422ToI420(src_y, src_width,
-                     src_u, halfwidth,
-                     src_v, halfwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height);
       break;
     }
     case FOURCC_I444:
     case FOURCC_YV24: {
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u;
-      const uint8* src_v;
+      const uint8_t* src_y = sample + src_width * crop_y + crop_x;
+      const uint8_t* src_u;
+      const uint8_t* src_v;
       if (format == FOURCC_YV24) {
         src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
@@ -277,38 +242,16 @@
         src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
         src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
       }
-      r = I444ToI420(src_y, src_width,
-                     src_u, src_width,
-                     src_v, src_width,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
+      r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width,
+                     dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, crop_width, inv_crop_height);
       break;
     }
-    case FOURCC_I411: {
-      int quarterwidth = (src_width + 3) / 4;
-      const uint8* src_y = sample + src_width * crop_y + crop_x;
-      const uint8* src_u = sample + src_width * abs_src_height +
-          quarterwidth * crop_y + crop_x / 4;
-      const uint8* src_v = sample + src_width * abs_src_height +
-          quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
-      r = I411ToI420(src_y, src_width,
-                     src_u, quarterwidth,
-                     src_v, quarterwidth,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     crop_width, inv_crop_height);
-      break;
-    }
 #ifdef HAVE_JPEG
     case FOURCC_MJPG:
-      r = MJPGToI420(sample, sample_size,
-                     y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     src_width, abs_src_height, crop_width, inv_crop_height);
+      r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u,
+                     dst_stride_u, dst_v, dst_stride_v, src_width,
+                     abs_src_height, crop_width, inv_crop_height);
       break;
 #endif
     default:
@@ -317,13 +260,10 @@
 
   if (need_buf) {
     if (!r) {
-      r = I420Rotate(y, y_stride,
-                     u, u_stride,
-                     v, v_stride,
-                     tmp_y, tmp_y_stride,
-                     tmp_u, tmp_u_stride,
-                     tmp_v, tmp_v_stride,
-                     crop_width, abs_crop_height, rotation);
+      r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v,
+                     dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride,
+                     tmp_v, tmp_v_stride, crop_width, abs_crop_height,
+                     rotation);
     }
     free(rotate_buffer);
   }
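
The NV21 case at the top of this hunk reuses NV12ToI420Rotate, exchanging
only the destination U and V arguments, since NV21 interleaves V before U.
A minimal sketch of the same trick, using the argument order visible in the
hunk (the libyuv/rotate.h header and kRotate0 constant are assumed):

    #include "libyuv/rotate.h"

    // Sketch: convert NV21 by calling the NV12 path with dst_u/dst_v swapped.
    int NV21ToI420Sketch(const uint8_t* src_y, int src_stride_y,
                         const uint8_t* src_vu, int src_stride_vu,
                         uint8_t* dst_y, int dst_stride_y,
                         uint8_t* dst_u, int dst_stride_u,
                         uint8_t* dst_v, int dst_stride_v,
                         int width, int height) {
      // NV21 stores V first in each chroma pair, so dst_v goes where the
      // NV12 routine would write its first chroma plane.
      return libyuv::NV12ToI420Rotate(src_y, src_stride_y, src_vu,
                                      src_stride_vu, dst_y, dst_stride_y,
                                      dst_v, dst_stride_v, dst_u,
                                      dst_stride_u, width, height,
                                      libyuv::kRotate0);
    }
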
--- a/third_party/libyuv/source/cpu_id.cc
+++ b/third_party/libyuv/source/cpu_id.cc
@@ -13,22 +13,16 @@
 #if defined(_MSC_VER)
 #include <intrin.h>  // For __cpuidex()
 #endif
-#if !defined(__pnacl__) && !defined(__CLR_VER) && \
+#if !defined(__pnacl__) && !defined(__CLR_VER) &&                           \
     !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \
     defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
 #include <immintrin.h>  // For _xgetbv()
 #endif
 
-#if !defined(__native_client__)
-#include <stdlib.h>  // For getenv()
-#endif
-
 // For ArmCpuCaps() but unittested on all platforms
 #include <stdio.h>
 #include <string.h>
 
-#include "libyuv/basic_types.h"  // For CPU_X86
-
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -43,16 +37,20 @@
 #define SAFEBUFFERS
 #endif
 
+// cpu_info_ variable for SIMD instruction sets detected.
+LIBYUV_API int cpu_info_ = 0;
+
+// TODO(fbarchard): Consider using int for cpuid so casting is not needed.
 // Low level cpuid for X86.
-#if (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__)) && \
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+     defined(__x86_64__)) &&                                     \
     !defined(__pnacl__) && !defined(__CLR_VER)
 LIBYUV_API
-void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
+void CpuId(int info_eax, int info_ecx, int* cpu_info) {
 #if defined(_MSC_VER)
 // Visual C version uses intrinsic or inline x86 assembly.
 #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
-  __cpuidex((int*)(cpu_info), info_eax, info_ecx);
+  __cpuidex(cpu_info, info_eax, info_ecx);
 #elif defined(_M_IX86)
   __asm {
     mov        eax, info_eax
@@ -66,26 +64,26 @@
   }
 #else  // Visual C but not x86
   if (info_ecx == 0) {
-    __cpuid((int*)(cpu_info), info_eax);
+    __cpuid(cpu_info, info_eax);
   } else {
-    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0;
+    cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u;
   }
 #endif
 // GCC version uses inline x86 assembly.
 #else  // defined(_MSC_VER)
-  uint32 info_ebx, info_edx;
-  asm volatile (
-#if defined( __i386__) && defined(__PIC__)
-    // Preserve ebx for fpic 32 bit.
-    "mov %%ebx, %%edi                          \n"
-    "cpuid                                     \n"
-    "xchg %%edi, %%ebx                         \n"
-    : "=D" (info_ebx),
+  int info_ebx, info_edx;
+  asm volatile(
+#if defined(__i386__) && defined(__PIC__)
+      // Preserve ebx for fpic 32 bit.
+      "mov %%ebx, %%edi                          \n"
+      "cpuid                                     \n"
+      "xchg %%edi, %%ebx                         \n"
+      : "=D"(info_ebx),
 #else
-    "cpuid                                     \n"
-    : "=b" (info_ebx),
+      "cpuid                                     \n"
+      : "=b"(info_ebx),
 #endif  //  defined( __i386__) && defined(__PIC__)
-      "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx));
+        "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx));
   cpu_info[0] = info_eax;
   cpu_info[1] = info_ebx;
   cpu_info[2] = info_ecx;
@@ -94,7 +92,9 @@
 }
 #else  // (defined(_M_IX86) || defined(_M_X64) ...
 LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) {
+void CpuId(int eax, int ecx, int* cpu_info) {
+  (void)eax;
+  (void)ecx;
   cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0;
 }
 #endif
@@ -111,20 +111,22 @@
 #if defined(_M_IX86) && (_MSC_VER < 1900)
 #pragma optimize("g", off)
 #endif
-#if (defined(_M_IX86) || defined(_M_X64) || \
-    defined(__i386__) || defined(__x86_64__)) && \
+#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+     defined(__x86_64__)) &&                                     \
     !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__)
-#define HAS_XGETBV
 // X86 CPUs have xgetbv to detect whether the OS saves high ymm registers.
 int GetXCR0() {
-  uint32 xcr0 = 0u;
+  int xcr0 = 0;
 #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219)
-  xcr0 = (uint32)(_xgetbv(0));  // VS2010 SP1 required.
+  xcr0 = (int)_xgetbv(0);  // VS2010 SP1 required.  NOLINT
 #elif defined(__i386__) || defined(__x86_64__)
-  asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx");
+  asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx");
 #endif  // defined(__i386__) || defined(__x86_64__)
   return xcr0;
 }
+#else
+// xgetbv is unavailable to query for OSSave support; return 0.
+#define GetXCR0() 0
 #endif  // defined(_M_IX86) || defined(_M_X64) ..
 // Return optimization to previous setting.
 #if defined(_M_IX86) && (_MSC_VER < 1900)
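
GetXCR0() reads extended control register 0; the detection code further down
tests (xcr0 & 6) == 6 before enabling AVX and (xcr0 & 0xe0) == 0xe0 before
enabling AVX-512. The bit meanings, as standalone checks:

    // Sketch of the XCR0 tests used below. Bit 1 = SSE/XMM state, bit 2 =
    // AVX/YMM state; bits 5-7 = opmask, ZMM0-15 and ZMM16-31 state.
    bool OsSavesYmm(int xcr0) { return (xcr0 & 6) == 6; }
    bool OsSavesZmm(int xcr0) { return (xcr0 & 0xe0) == 0xe0; }
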
@@ -133,8 +135,7 @@
 
 // based on libvpx arm_cpudetect.c
 // For Arm, but public to allow testing on any CPU
-LIBYUV_API SAFEBUFFERS
-int ArmCpuCaps(const char* cpuinfo_name) {
+LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) {
   char cpuinfo_line[512];
   FILE* f = fopen(cpuinfo_name, "r");
   if (!f) {
@@ -151,7 +152,7 @@
       }
       // aarch64 uses asimd for Neon.
       p = strstr(cpuinfo_line, " asimd");
-      if (p && (p[6] == ' ' || p[6] == '\n')) {
+      if (p) {
         fclose(f);
         return kCpuHasNEON;
       }
@@ -161,103 +162,78 @@
   return 0;
 }
 
-// CPU detect function for SIMD instruction sets.
-LIBYUV_API
-int cpu_info_ = 0;  // cpu_info is not initialized yet.
-
-// Test environment variable for disabling CPU features. Any non-zero value
-// to disable. Zero ignored to make it easy to set the variable on/off.
-#if !defined(__native_client__) && !defined(_M_ARM)
-
-static LIBYUV_BOOL TestEnv(const char* name) {
-  const char* var = getenv(name);
-  if (var) {
-    if (var[0] != '0') {
-      return LIBYUV_TRUE;
+// TODO(fbarchard): Consider read_msa_ir().
+// TODO(fbarchard): Add unittest.
+LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name,
+                                       const char ase[]) {
+  char cpuinfo_line[512];
+  FILE* f = fopen(cpuinfo_name, "r");
+  if (!f) {
+    // Assume the ASE is enabled if /proc/cpuinfo is unavailable.
+    if (strcmp(ase, " msa") == 0) {
+      return kCpuHasMSA;
     }
+    return 0;
   }
-  return LIBYUV_FALSE;
+  while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) {
+    if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) {
+      char* p = strstr(cpuinfo_line, ase);
+      if (p) {
+        fclose(f);
+        if (strcmp(ase, " msa") == 0) {
+          return kCpuHasMSA;
+        }
+        return 0;
+      }
+    }
+  }
+  fclose(f);
+  return 0;
 }
-#else  // nacl does not support getenv().
-static LIBYUV_BOOL TestEnv(const char*) {
-  return LIBYUV_FALSE;
-}
-#endif
 
-LIBYUV_API SAFEBUFFERS
-int InitCpuFlags(void) {
-  // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized.
+static SAFEBUFFERS int GetCpuFlags(void) {
   int cpu_info = 0;
-#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
-  uint32 cpu_info0[4] = { 0, 0, 0, 0 };
-  uint32 cpu_info1[4] = { 0, 0, 0, 0 };
-  uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+#if !defined(__pnacl__) && !defined(__CLR_VER) &&                   \
+    (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+     defined(_M_IX86))
+  int cpu_info0[4] = {0, 0, 0, 0};
+  int cpu_info1[4] = {0, 0, 0, 0};
+  int cpu_info7[4] = {0, 0, 0, 0};
   CpuId(0, 0, cpu_info0);
   CpuId(1, 0, cpu_info1);
   if (cpu_info0[0] >= 7) {
     CpuId(7, 0, cpu_info7);
   }
-  cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
+  cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
              ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
              ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
              ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) |
-             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
-             ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
-             kCpuHasX86;
+             ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0);
 
-#ifdef HAS_XGETBV
-  // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv
+  // AVX requires the OS to save YMM registers.
   if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) &&  // AVX and OSXSave
       ((GetXCR0() & 6) == 6)) {  // Test OS saves YMM registers
-    cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX;
+    cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) |
+                ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
+                ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0);
 
     // Detect AVX512bw
     if ((GetXCR0() & 0xe0) == 0xe0) {
-      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0;
+      cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0;
+      cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0;
+      cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0;
+      cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0;
+      cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0;
     }
   }
 #endif
-
-  // Environment variable overrides for testing.
-  if (TestEnv("LIBYUV_DISABLE_X86")) {
-    cpu_info &= ~kCpuHasX86;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE2")) {
-    cpu_info &= ~kCpuHasSSE2;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSSE3")) {
-    cpu_info &= ~kCpuHasSSSE3;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE41")) {
-    cpu_info &= ~kCpuHasSSE41;
-  }
-  if (TestEnv("LIBYUV_DISABLE_SSE42")) {
-    cpu_info &= ~kCpuHasSSE42;
-  }
-  if (TestEnv("LIBYUV_DISABLE_AVX")) {
-    cpu_info &= ~kCpuHasAVX;
-  }
-  if (TestEnv("LIBYUV_DISABLE_AVX2")) {
-    cpu_info &= ~kCpuHasAVX2;
-  }
-  if (TestEnv("LIBYUV_DISABLE_ERMS")) {
-    cpu_info &= ~kCpuHasERMS;
-  }
-  if (TestEnv("LIBYUV_DISABLE_FMA3")) {
-    cpu_info &= ~kCpuHasFMA3;
-  }
-  if (TestEnv("LIBYUV_DISABLE_AVX3")) {
-    cpu_info &= ~kCpuHasAVX3;
-  }
-#endif
 #if defined(__mips__) && defined(__linux__)
-#if defined(__mips_dspr2)
-  cpu_info |= kCpuHasDSPR2;
+#if defined(__mips_msa)
+  cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa");
 #endif
   cpu_info |= kCpuHasMIPS;
-  if (getenv("LIBYUV_DISABLE_DSPR2")) {
-    cpu_info &= ~kCpuHasDSPR2;
-  }
 #endif
 #if defined(__arm__) || defined(__aarch64__)
 // gcc -mfpu=neon defines __ARM_NEON__
@@ -276,22 +252,22 @@
   cpu_info = ArmCpuCaps("/proc/cpuinfo");
 #endif
   cpu_info |= kCpuHasARM;
-  if (TestEnv("LIBYUV_DISABLE_NEON")) {
-    cpu_info &= ~kCpuHasNEON;
-  }
 #endif  // __arm__
-  if (TestEnv("LIBYUV_DISABLE_ASM")) {
-    cpu_info = 0;
-  }
-  cpu_info  |= kCpuInitialized;
-  cpu_info_ = cpu_info;
+  cpu_info |= kCpuInitialized;
   return cpu_info;
 }
 
 // Note that use of this function is not thread safe.
 LIBYUV_API
-void MaskCpuFlags(int enable_flags) {
-  cpu_info_ = InitCpuFlags() & enable_flags;
+int MaskCpuFlags(int enable_flags) {
+  int cpu_info = GetCpuFlags() & enable_flags;
+  SetCpuFlags(cpu_info);
+  return cpu_info;
+}
+
+LIBYUV_API
+int InitCpuFlags(void) {
+  return MaskCpuFlags(-1);
 }
 
 #ifdef __cplusplus
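
With TestEnv() and the LIBYUV_DISABLE_* environment overrides removed,
feature masking is done through the API. A sketch, assuming the
InitCpuFlags/MaskCpuFlags declarations in libyuv/cpu_id.h:

    #include "libyuv/cpu_id.h"

    // Sketch: programmatic replacement for the removed LIBYUV_DISABLE_AVX2.
    void DisableAvx2ForTest() {
      int flags = libyuv::InitCpuFlags();                  // detect everything
      libyuv::MaskCpuFlags(flags & ~libyuv::kCpuHasAVX2);  // clear one feature
    }
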
--- a/third_party/libyuv/source/mjpeg_decoder.cc
+++ b/third_party/libyuv/source/mjpeg_decoder.cc
@@ -21,7 +21,7 @@
 
 #if defined(_MSC_VER)
 // disable warning 4324: structure was padded due to __declspec(align())
-#pragma warning(disable:4324)
+#pragma warning(disable : 4324)
 #endif
 
 #endif
@@ -102,7 +102,7 @@
   DestroyOutputBuffers();
 }
 
-LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) {
   if (!ValidateJpeg(src, src_len)) {
     return LIBYUV_FALSE;
   }
@@ -129,7 +129,7 @@
       if (scanlines_[i]) {
         delete scanlines_[i];
       }
-      scanlines_[i] = new uint8* [scanlines_size];
+      scanlines_[i] = new uint8_t*[scanlines_size];
       scanlines_sizes_[i] = scanlines_size;
     }
 
@@ -145,7 +145,7 @@
       if (databuf_[i]) {
         delete databuf_[i];
       }
-      databuf_[i] = new uint8[databuf_size];
+      databuf_[i] = new uint8_t[databuf_size];
       databuf_strides_[i] = databuf_stride;
     }
 
@@ -195,13 +195,11 @@
 }
 
 int MJpegDecoder::GetHorizSubSampFactor(int component) {
-  return decompress_struct_->max_h_samp_factor /
-      GetHorizSampFactor(component);
+  return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component);
 }
 
 int MJpegDecoder::GetVertSubSampFactor(int component) {
-  return decompress_struct_->max_v_samp_factor /
-      GetVertSampFactor(component);
+  return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component);
 }
 
 int MJpegDecoder::GetImageScanlinesPerImcuRow() {
@@ -245,10 +243,10 @@
 }
 
 // TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
-LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
-    uint8** planes, int dst_width, int dst_height) {
-  if (dst_width != GetWidth() ||
-      dst_height > GetHeight()) {
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes,
+                                          int dst_width,
+                                          int dst_height) {
+  if (dst_width != GetWidth() || dst_height > GetHeight()) {
     // ERROR: Bad dimensions
     return LIBYUV_FALSE;
   }
@@ -289,14 +287,13 @@
       for (int i = 0; i < num_outbufs_; ++i) {
         // TODO(fbarchard): Compute skip to avoid this
         assert(skip % GetVertSubSampFactor(i) == 0);
-        int rows_to_skip =
-            DivideAndRoundDown(skip, GetVertSubSampFactor(i));
-        int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
-                                rows_to_skip;
+        int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+        int scanlines_to_copy =
+            GetComponentScanlinesPerImcuRow(i) - rows_to_skip;
         int data_to_skip = rows_to_skip * GetComponentStride(i);
-        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
-                  planes[i], GetComponentWidth(i),
-                  GetComponentWidth(i), scanlines_to_copy);
+        CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i],
+                  GetComponentWidth(i), GetComponentWidth(i),
+                  scanlines_to_copy);
         planes[i] += scanlines_to_copy * GetComponentWidth(i);
       }
       lines_left -= (GetImageScanlinesPerImcuRow() - skip);
@@ -305,7 +302,7 @@
 
   // Read full MCUs but cropped horizontally
   for (; lines_left > GetImageScanlinesPerImcuRow();
-         lines_left -= GetImageScanlinesPerImcuRow()) {
+       lines_left -= GetImageScanlinesPerImcuRow()) {
     if (!DecodeImcuRow()) {
       FinishDecode();
       return LIBYUV_FALSE;
@@ -312,9 +309,8 @@
     }
     for (int i = 0; i < num_outbufs_; ++i) {
       int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
-      CopyPlane(databuf_[i], GetComponentStride(i),
-                planes[i], GetComponentWidth(i),
-                GetComponentWidth(i), scanlines_to_copy);
+      CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+                GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
       planes[i] += scanlines_to_copy * GetComponentWidth(i);
     }
   }
@@ -328,9 +324,8 @@
     for (int i = 0; i < num_outbufs_; ++i) {
       int scanlines_to_copy =
           DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
-      CopyPlane(databuf_[i], GetComponentStride(i),
-                planes[i], GetComponentWidth(i),
-                GetComponentWidth(i), scanlines_to_copy);
+      CopyPlane(databuf_[i], GetComponentStride(i), planes[i],
+                GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy);
       planes[i] += scanlines_to_copy * GetComponentWidth(i);
     }
   }
@@ -337,10 +332,11 @@
   return FinishDecode();
 }
 
-LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
-    int dst_width, int dst_height) {
-  if (dst_width != GetWidth() ||
-      dst_height > GetHeight()) {
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn,
+                                           void* opaque,
+                                           int dst_width,
+                                           int dst_height) {
+  if (dst_width != GetWidth() || dst_height > GetHeight()) {
     // ERROR: Bad dimensions
     return LIBYUV_FALSE;
   }
@@ -395,7 +391,7 @@
   }
   // Read full MCUs until we get to the crop point.
   for (; lines_left >= GetImageScanlinesPerImcuRow();
-         lines_left -= GetImageScanlinesPerImcuRow()) {
+       lines_left -= GetImageScanlinesPerImcuRow()) {
     if (!DecodeImcuRow()) {
       FinishDecode();
       return LIBYUV_FALSE;
@@ -435,22 +431,22 @@
 }
 
 void term_source(j_decompress_ptr cinfo) {
-  // Nothing to do.
+  (void)cinfo;  // Nothing to do.
 }
 
 #ifdef HAVE_SETJMP
 void ErrorHandler(j_common_ptr cinfo) {
-  // This is called when a jpeglib command experiences an error. Unfortunately
-  // jpeglib's error handling model is not very flexible, because it expects the
-  // error handler to not return--i.e., it wants the program to terminate. To
-  // recover from errors we use setjmp() as shown in their example. setjmp() is
-  // C's implementation for the "call with current continuation" functionality
-  // seen in some functional programming languages.
-  // A formatted message can be output, but is unsafe for release.
+// This is called when a jpeglib command experiences an error. Unfortunately
+// jpeglib's error handling model is not very flexible, because it expects the
+// error handler to not return--i.e., it wants the program to terminate. To
+// recover from errors we use setjmp() as shown in their example. setjmp() is
+// C's implementation for the "call with current continuation" functionality
+// seen in some functional programming languages.
+// A formatted message can be output, but is unsafe for release.
 #ifdef DEBUG
   char buf[JMSG_LENGTH_MAX];
   (*cinfo->err->format_message)(cinfo, buf);
-  // ERROR: Error in jpeglib: buf
+// ERROR: Error in jpeglib: buf
 #endif
 
   SetJmpErrorMgr* mgr = reinterpret_cast<SetJmpErrorMgr*>(cinfo->err);
@@ -459,8 +455,9 @@
   longjmp(mgr->setjmp_buffer, 1);
 }
 
+// Suppress fprintf warnings.
 void OutputHandler(j_common_ptr cinfo) {
-  // Suppress fprintf warnings.
+  (void)cinfo;
 }
 
 #endif  // HAVE_SETJMP
@@ -472,9 +469,9 @@
     // it.
     DestroyOutputBuffers();
 
-    scanlines_ = new uint8** [num_outbufs];
+    scanlines_ = new uint8_t**[num_outbufs];
     scanlines_sizes_ = new int[num_outbufs];
-    databuf_ = new uint8* [num_outbufs];
+    databuf_ = new uint8_t*[num_outbufs];
     databuf_strides_ = new int[num_outbufs];
 
     for (int i = 0; i < num_outbufs; ++i) {
@@ -490,13 +487,13 @@
 
 void MJpegDecoder::DestroyOutputBuffers() {
   for (int i = 0; i < num_outbufs_; ++i) {
-    delete [] scanlines_[i];
-    delete [] databuf_[i];
+    delete[] scanlines_[i];
+    delete[] databuf_[i];
   }
-  delete [] scanlines_;
-  delete [] databuf_;
-  delete [] scanlines_sizes_;
-  delete [] databuf_strides_;
+  delete[] scanlines_;
+  delete[] databuf_;
+  delete[] scanlines_sizes_;
+  delete[] databuf_strides_;
   scanlines_ = NULL;
   databuf_ = NULL;
   scanlines_sizes_ = NULL;
@@ -530,9 +527,9 @@
   return LIBYUV_TRUE;
 }
 
-void MJpegDecoder::SetScanlinePointers(uint8** data) {
+void MJpegDecoder::SetScanlinePointers(uint8_t** data) {
   for (int i = 0; i < num_outbufs_; ++i) {
-    uint8* data_i = data[i];
+    uint8_t* data_i = data[i];
     for (int j = 0; j < scanlines_sizes_[i]; ++j) {
       scanlines_[i][j] = data_i;
       data_i += GetComponentStride(i);
@@ -542,26 +539,26 @@
 
 inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
   return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
-      jpeg_read_raw_data(decompress_struct_,
-                         scanlines_,
-                         GetImageScanlinesPerImcuRow());
+         jpeg_read_raw_data(decompress_struct_, scanlines_,
+                            GetImageScanlinesPerImcuRow());
 }
 
 // The helper function which recognizes the jpeg sub-sampling type.
 JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
-    int* subsample_x, int* subsample_y, int number_of_components) {
+    int* subsample_x,
+    int* subsample_y,
+    int number_of_components) {
   if (number_of_components == 3) {  // Color images.
-    if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 2 && subsample_y[1] == 2 &&
-        subsample_x[2] == 2 && subsample_y[2] == 2) {
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
+        subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) {
       return kJpegYuv420;
-    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 2 && subsample_y[1] == 1 &&
-        subsample_x[2] == 2 && subsample_y[2] == 1) {
+    }
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 &&
+        subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) {
       return kJpegYuv422;
-    } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
-        subsample_x[1] == 1 && subsample_y[1] == 1 &&
-        subsample_x[2] == 1 && subsample_y[2] == 1) {
+    }
+    if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 &&
+        subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) {
       return kJpegYuv444;
     }
   } else if (number_of_components == 1) {  // Grey-scale images.
@@ -574,4 +571,3 @@
 
 }  // namespace libyuv
 #endif  // HAVE_JPEG
-
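
For reference, the DecodeToBuffers() path reformatted above is typically
driven as below; a sketch assuming the MJpegDecoder interface in
libyuv/mjpeg_decoder.h and caller-allocated planes sized for the frame's
subsampling:

    #include "libyuv/mjpeg_decoder.h"

    // Sketch: decode one MJPEG frame into caller-owned planes.
    bool DecodeFrame(const uint8_t* jpg, size_t jpg_size, uint8_t* planes[3]) {
      libyuv::MJpegDecoder decoder;
      if (!decoder.LoadFrame(jpg, jpg_size)) {
        return false;  // not a parseable JPEG
      }
      bool ok = decoder.DecodeToBuffers(planes, decoder.GetWidth(),
                                        decoder.GetHeight()) != 0;
      decoder.UnloadFrame();
      return ok;
    }
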
--- a/third_party/libyuv/source/mjpeg_validate.cc
+++ b/third_party/libyuv/source/mjpeg_validate.cc
@@ -18,13 +18,13 @@
 #endif
 
 // Helper function to scan for EOI marker (0xff 0xd9).
-static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) {
+static LIBYUV_BOOL ScanEOI(const uint8_t* sample, size_t sample_size) {
   if (sample_size >= 2) {
-    const uint8* end = sample + sample_size - 1;
-    const uint8* it = sample;
+    const uint8_t* end = sample + sample_size - 1;
+    const uint8_t* it = sample;
     while (it < end) {
       // TODO(fbarchard): scan for 0xd9 instead.
-      it = static_cast<const uint8 *>(memchr(it, 0xff, end - it));
+      it = (const uint8_t*)(memchr(it, 0xff, end - it));
       if (it == NULL) {
         break;
       }
@@ -39,7 +39,7 @@
 }
 
 // Helper function to validate the jpeg appears intact.
-LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size) {
   // Maximum size that ValidateJpeg will consider valid.
   const size_t kMaxJpegSize = 0x7fffffffull;
   const size_t kBackSearchSize = 1024;
@@ -68,4 +68,3 @@
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
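
ScanEOI() above walks the buffer with memchr looking for the 0xff 0xd9
end-of-image marker; the same scan as a standalone sketch:

    #include <stdint.h>
    #include <string.h>

    // Sketch: return true if buf contains a JPEG EOI marker (0xff 0xd9).
    static bool HasEOI(const uint8_t* buf, size_t len) {
      if (len < 2) return false;
      const uint8_t* end = buf + len - 1;  // last byte that can start a pair
      const uint8_t* it = buf;
      while (it < end) {
        it = (const uint8_t*)memchr(it, 0xff, (size_t)(end - it));
        if (it == NULL) break;
        if (it[1] == 0xd9) return true;
        ++it;
      }
      return false;
    }
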
--- a/third_party/libyuv/source/planar_functions.cc
+++ b/third_party/libyuv/source/planar_functions.cc
@@ -26,11 +26,14 @@
 
 // Copy a plane of data
 LIBYUV_API
-void CopyPlane(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+void CopyPlane(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   int y;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -38,8 +41,7 @@
     dst_stride_y = -dst_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -48,6 +50,7 @@
   if (src_y == dst_y && src_stride_y == dst_stride_y) {
     return;
   }
+
 #if defined(HAS_COPYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -68,11 +71,6 @@
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
 
   // Copy plane
   for (y = 0; y < height; ++y) {
@@ -83,15 +81,18 @@
 }
 
 // TODO(fbarchard): Consider support for negative height.
+// TODO(fbarchard): Consider stride measured in bytes.
 LIBYUV_API
-void CopyPlane_16(const uint16* src_y, int src_stride_y,
-                  uint16* dst_y, int dst_stride_y,
-                  int width, int height) {
+void CopyPlane_16(const uint16_t* src_y,
+                  int src_stride_y,
+                  uint16_t* dst_y,
+                  int dst_stride_y,
+                  int width,
+                  int height) {
   int y;
-  void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C;
+  void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C;
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_y == width) {
+  if (src_stride_y == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_y = 0;
@@ -111,11 +112,6 @@
     CopyRow = CopyRow_16_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_16_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_16_MIPS;
-  }
-#endif
 
   // Copy plane
   for (y = 0; y < height; ++y) {
@@ -125,19 +121,124 @@
   }
 }
 
+// Convert a plane of 16-bit data to 8-bit
+LIBYUV_API
+void Convert16To8Plane(const uint16_t* src_y,
+                       int src_stride_y,
+                       uint8_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height) {
+  int y;
+  void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale,
+                          int width) = Convert16To8Row_C;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_CONVERT16TO8ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    Convert16To8Row = Convert16To8Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      Convert16To8Row = Convert16To8Row_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_CONVERT16TO8ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      Convert16To8Row = Convert16To8Row_AVX2;
+    }
+  }
+#endif
+
+  // Convert plane
+  for (y = 0; y < height; ++y) {
+    Convert16To8Row(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
+// Convert a plane of 8-bit data to 16-bit
+LIBYUV_API
+void Convert8To16Plane(const uint8_t* src_y,
+                       int src_stride_y,
+                       uint16_t* dst_y,
+                       int dst_stride_y,
+                       int scale,  // 16384 for 10 bits
+                       int width,
+                       int height) {
+  int y;
+  void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale,
+                          int width) = Convert8To16Row_C;
+
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_stride_y = -dst_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_CONVERT8TO16ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    Convert8To16Row = Convert8To16Row_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      Convert8To16Row = Convert8To16Row_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_CONVERT8TO16ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    Convert8To16Row = Convert8To16Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      Convert8To16Row = Convert8To16Row_AVX2;
+    }
+  }
+#endif
+
+  // Convert plane
+  for (y = 0; y < height; ++y) {
+    Convert8To16Row(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+}
+
 // Copy I422.
 LIBYUV_API
-int I422Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
+int I422Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
   int halfwidth = (width + 1) >> 1;
-  if (!src_u || !src_v ||
-      !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
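
The scale argument to the new Convert16To8Plane()/Convert8To16Plane() is a
fixed-point factor (the parameter comment notes 16384 for 10-bit data).
Assuming the C row function computes clamp((v * scale) >> 16), the 10-bit
case reduces to a shift: (v * 16384) >> 16 == v >> 2, so 0..1023 maps onto
0..255. A usage sketch (declaration assumed to live in
libyuv/planar_functions.h alongside CopyPlane):

    #include "libyuv/planar_functions.h"

    // Sketch: narrow a 10-bit plane (uint16_t, values 0..1023) to 8 bits.
    void TenBitToEightBit(const uint16_t* src, int src_stride,
                          uint8_t* dst, int dst_stride,
                          int width, int height) {
      libyuv::Convert16To8Plane(src, src_stride, dst, dst_stride,
                                16384 /* scale for 10-bit input */,
                                width, height);
    }
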
@@ -161,16 +262,21 @@
 
 // Copy I444.
 LIBYUV_API
-int I444Copy(const uint8* src_y, int src_stride_y,
-             const uint8* src_u, int src_stride_u,
-             const uint8* src_v, int src_stride_v,
-             uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int width, int height) {
-  if (!src_u || !src_v ||
-      !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+int I444Copy(const uint8_t* src_y,
+             int src_stride_y,
+             const uint8_t* src_u,
+             int src_stride_u,
+             const uint8_t* src_v,
+             int src_stride_v,
+             uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int width,
+             int height) {
+  if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -194,9 +300,12 @@
 
 // Copy I400.
 LIBYUV_API
-int I400ToI400(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int I400ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -212,11 +321,20 @@
 
 // Convert I420 to I400.
 LIBYUV_API
-int I420ToI400(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int I420ToI400(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  (void)src_u;
+  (void)src_stride_u;
+  (void)src_v;
+  (void)src_stride_v;
   if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -234,12 +352,16 @@
 // Support function for NV12 etc UV channels.
 // Width and height are plane sizes (typically half pixel width).
 LIBYUV_API
-void SplitUVPlane(const uint8* src_uv, int src_stride_uv,
-                  uint8* dst_u, int dst_stride_u,
-                  uint8* dst_v, int dst_stride_v,
-                  int width, int height) {
+void SplitUVPlane(const uint8_t* src_uv,
+                  int src_stride_uv,
+                  uint8_t* dst_u,
+                  int dst_stride_u,
+                  uint8_t* dst_v,
+                  int dst_stride_v,
+                  int width,
+                  int height) {
   int y;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+  void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                      int width) = SplitUVRow_C;
   // Negative height means invert the image.
   if (height < 0) {
@@ -250,8 +372,7 @@
     dst_stride_v = -dst_stride_v;
   }
   // Coalesce rows.
-  if (src_stride_uv == width * 2 &&
-      dst_stride_u == width &&
+  if (src_stride_uv == width * 2 && dst_stride_u == width &&
       dst_stride_v == width) {
     width *= height;
     height = 1;
@@ -281,13 +402,11 @@
     }
   }
 #endif
-#if defined(HAS_SPLITUVROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
-      IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
-    SplitUVRow = SplitUVRow_Any_DSPR2;
-    if (IS_ALIGNED(width, 16)) {
-      SplitUVRow = SplitUVRow_DSPR2;
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
     }
   }
 #endif
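
SplitUVPlane() gains an MSA path above; its usual job is deinterleaving
NV12/NV21 chroma. A usage sketch for NV12, where the chroma plane spans the
half-rounded-up luma dimensions:

    #include "libyuv/planar_functions.h"

    // Sketch: deinterleave an NV12 UV plane into separate U and V planes.
    void SplitNV12Chroma(const uint8_t* src_uv, int src_stride_uv,
                         uint8_t* dst_u, int dst_stride_u,
                         uint8_t* dst_v, int dst_stride_v,
                         int luma_width, int luma_height) {
      libyuv::SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u,
                           dst_v, dst_stride_v,
                           (luma_width + 1) / 2, (luma_height + 1) / 2);
    }
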
@@ -302,13 +421,17 @@
 }
 
 LIBYUV_API
-void MergeUVPlane(const uint8* src_u, int src_stride_u,
-                  const uint8* src_v, int src_stride_v,
-                  uint8* dst_uv, int dst_stride_uv,
-                  int width, int height) {
+void MergeUVPlane(const uint8_t* src_u,
+                  int src_stride_u,
+                  const uint8_t* src_v,
+                  int src_stride_v,
+                  uint8_t* dst_uv,
+                  int dst_stride_uv,
+                  int width,
+                  int height) {
   int y;
-  void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-      int width) = MergeUVRow_C;
+  void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
+                     uint8_t* dst_uv, int width) = MergeUVRow_C;
   // Coalesce rows.
   // Negative height means invert the image.
   if (height < 0) {
@@ -317,8 +440,7 @@
     dst_stride_uv = -dst_stride_uv;
   }
   // Coalesce rows.
-  if (src_stride_u == width &&
-      src_stride_v == width &&
+  if (src_stride_u == width && src_stride_v == width &&
       dst_stride_uv == width * 2) {
     width *= height;
     height = 1;
@@ -348,6 +470,14 @@
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MergeUVRow = MergeUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      MergeUVRow = MergeUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     // Merge a row of U and V into a row of UV.
@@ -358,12 +488,131 @@
   }
 }
 
+// Support function for NV12 etc RGB channels.
+// Width and height are plane sizes (typically half pixel width).
+LIBYUV_API
+void SplitRGBPlane(const uint8_t* src_rgb,
+                   int src_stride_rgb,
+                   uint8_t* dst_r,
+                   int dst_stride_r,
+                   uint8_t* dst_g,
+                   int dst_stride_g,
+                   uint8_t* dst_b,
+                   int dst_stride_b,
+                   int width,
+                   int height) {
+  int y;
+  void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
+                      uint8_t* dst_b, int width) = SplitRGBRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_r = dst_r + (height - 1) * dst_stride_r;
+    dst_g = dst_g + (height - 1) * dst_stride_g;
+    dst_b = dst_b + (height - 1) * dst_stride_b;
+    dst_stride_r = -dst_stride_r;
+    dst_stride_g = -dst_stride_g;
+    dst_stride_b = -dst_stride_b;
+  }
+  // Coalesce rows.
+  if (src_stride_rgb == width * 3 && dst_stride_r == width &&
+      dst_stride_g == width && dst_stride_b == width) {
+    width *= height;
+    height = 1;
+    src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0;
+  }
+#if defined(HAS_SPLITRGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    SplitRGBRow = SplitRGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      SplitRGBRow = SplitRGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_SPLITRGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    SplitRGBRow = SplitRGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      SplitRGBRow = SplitRGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Split a row of RGB into rows of R, G and B.
+    SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width);
+    dst_r += dst_stride_r;
+    dst_g += dst_stride_g;
+    dst_b += dst_stride_b;
+    src_rgb += src_stride_rgb;
+  }
+}
+
+LIBYUV_API
+void MergeRGBPlane(const uint8_t* src_r,
+                   int src_stride_r,
+                   const uint8_t* src_g,
+                   int src_stride_g,
+                   const uint8_t* src_b,
+                   int src_stride_b,
+                   uint8_t* dst_rgb,
+                   int dst_stride_rgb,
+                   int width,
+                   int height) {
+  int y;
+  void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g,
+                      const uint8_t* src_b, uint8_t* dst_rgb, int width) =
+      MergeRGBRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb;
+    dst_stride_rgb = -dst_stride_rgb;
+  }
+  // Coalesce rows.
+  if (src_stride_r == width && src_stride_g == width && src_stride_b == width &&
+      dst_stride_rgb == width * 3) {
+    width *= height;
+    height = 1;
+    src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0;
+  }
+#if defined(HAS_MERGERGBROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MergeRGBRow = MergeRGBRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      MergeRGBRow = MergeRGBRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MERGERGBROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MergeRGBRow = MergeRGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      MergeRGBRow = MergeRGBRow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    // Merge rows of R, G and B into a row of RGB.
+    MergeRGBRow(src_r, src_g, src_b, dst_rgb, width);
+    src_r += src_stride_r;
+    src_g += src_stride_g;
+    src_b += src_stride_b;
+    dst_rgb += dst_stride_rgb;
+  }
+}
+
 // Mirror a plane of data.
-void MirrorPlane(const uint8* src_y, int src_stride_y,
-                 uint8* dst_y, int dst_stride_y,
-                 int width, int height) {
+void MirrorPlane(const uint8_t* src_y,
+                 int src_stride_y,
+                 uint8_t* dst_y,
+                 int dst_stride_y,
+                 int width,
+                 int height) {
   int y;
-  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+  void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -394,12 +643,12 @@
     }
   }
 #endif
-// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) {
-    MirrorRow = MirrorRow_DSPR2;
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
   }
 #endif
 
@@ -413,17 +662,24 @@
 
 // Convert YUY2 to I422.
 LIBYUV_API
-int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int YUY2ToI422(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*YUY2ToUV422Row)(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) =
-      YUY2ToUV422Row_C;
-  void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) =
+  void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u,
+                         uint8_t* dst_v, int width) = YUY2ToUV422Row_C;
+  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
       YUY2ToYRow_C;
+  if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -431,10 +687,9 @@
     src_stride_yuy2 = -src_stride_yuy2;
   }
   // Coalesce rows.
-  if (src_stride_yuy2 == width * 2 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_yuy2 == width * 2 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+      width * height <= 32768) {
     width *= height;
     height = 1;
     src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0;
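
The coalescing blocks touched throughout this file fold a contiguous image
into one long row so the row function runs a single time; the YUY2/UYVY
conversions above now also cap this at width * height <= 32768, presumably
to stay within pixel counts the SIMD row functions are tested for. The
idiom in isolation:

    // Sketch of the row-coalescing idiom: when every stride equals the row
    // width, the image is one contiguous buffer and can be processed as a
    // single row of width * height pixels.
    void CoalesceRows(int* width, int* height,
                      int* src_stride, int* dst_stride) {
      if (*src_stride == *width && *dst_stride == *width) {
        *width *= *height;
        *height = 1;
        *src_stride = *dst_stride = 0;
      }
    }
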
@@ -462,9 +717,7 @@
 #if defined(HAS_YUY2TOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     YUY2ToYRow = YUY2ToYRow_Any_NEON;
-    if (width >= 16) {
-      YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
-    }
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       YUY2ToYRow = YUY2ToYRow_NEON;
       YUY2ToUV422Row = YUY2ToUV422Row_NEON;
@@ -471,6 +724,16 @@
     }
   }
 #endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+      YUY2ToUV422Row = YUY2ToUV422Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
@@ -485,17 +748,24 @@
 
 // Convert UYVY to I422.
 LIBYUV_API
-int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int UYVYToI422(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int y;
-  void (*UYVYToUV422Row)(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) =
-      UYVYToUV422Row_C;
-  void (*UYVYToYRow)(const uint8* src_uyvy,
-                     uint8* dst_y, int width) = UYVYToYRow_C;
+  void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u,
+                         uint8_t* dst_v, int width) = UYVYToUV422Row_C;
+  void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) =
+      UYVYToYRow_C;
+  if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+    return -1;
+  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -503,10 +773,9 @@
     src_stride_uyvy = -src_stride_uyvy;
   }
   // Coalesce rows.
-  if (src_stride_uyvy == width * 2 &&
-      dst_stride_y == width &&
-      dst_stride_u * 2 == width &&
-      dst_stride_v * 2 == width) {
+  if (src_stride_uyvy == width * 2 && dst_stride_y == width &&
+      dst_stride_u * 2 == width && dst_stride_v * 2 == width &&
+      width * height <= 32768) {
     width *= height;
     height = 1;
     src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0;
@@ -534,9 +803,7 @@
 #if defined(HAS_UYVYTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     UYVYToYRow = UYVYToYRow_Any_NEON;
-    if (width >= 16) {
-      UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
-    }
+    UYVYToUV422Row = UYVYToUV422Row_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       UYVYToYRow = UYVYToYRow_NEON;
       UYVYToUV422Row = UYVYToUV422Row_NEON;
@@ -543,6 +810,16 @@
     }
   }
 #endif
+#if defined(HAS_UYVYTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUV422Row = UYVYToUV422Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
@@ -555,13 +832,82 @@
   return 0;
 }
 
+// Convert YUY2 to Y.
+LIBYUV_API
+int YUY2ToY(const uint8_t* src_yuy2,
+            int src_stride_yuy2,
+            uint8_t* dst_y,
+            int dst_stride_y,
+            int width,
+            int height) {
+  int y;
+  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
+      YUY2ToYRow_C;
+  if (!src_yuy2 || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
+  // Coalesce rows.
+  if (src_stride_yuy2 == width * 2 && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_yuy2 = dst_stride_y = 0;
+  }
+#if defined(HAS_YUY2TOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    YUY2ToYRow = YUY2ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    YUY2ToYRow(src_yuy2, dst_y, width);
+    src_yuy2 += src_stride_yuy2;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
 // Mirror I400 with optional flipping
 LIBYUV_API
-int I400Mirror(const uint8* src_y, int src_stride_y,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
-  if (!src_y || !dst_y ||
-      width <= 0 || height == 0) {
+int I400Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -577,17 +923,24 @@
 
 // Mirror I420 with optional flipping
 LIBYUV_API
-int I420Mirror(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height) {
+int I420Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+      height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -612,11 +965,14 @@
 
 // ARGB mirror.
 LIBYUV_API
-int ARGBMirror(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int ARGBMirror(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+  void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) =
       ARGBMirrorRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -651,6 +1007,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
 
   // Mirror plane
   for (y = 0; y < height; ++y) {
@@ -666,8 +1030,8 @@
 // the same blend function for all pixels if possible.
 LIBYUV_API
 ARGBBlendRow GetARGBBlend() {
-  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width) = ARGBBlendRow_C;
+  void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
+                       uint8_t* dst_argb, int width) = ARGBBlendRow_C;
 #if defined(HAS_ARGBBLENDROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBBlendRow = ARGBBlendRow_SSSE3;
@@ -679,18 +1043,27 @@
     ARGBBlendRow = ARGBBlendRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBBLENDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBBlendRow = ARGBBlendRow_MSA;
+  }
+#endif
   return ARGBBlendRow;
 }
 
 // Alpha Blend 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
-              const uint8* src_argb1, int src_stride_argb1,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
+int ARGBBlend(const uint8_t* src_argb0,
+              int src_stride_argb0,
+              const uint8_t* src_argb1,
+              int src_stride_argb1,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
   int y;
-  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width) = GetARGBBlend();
+  void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1,
+                       uint8_t* dst_argb, int width) = GetARGBBlend();
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -701,8 +1074,7 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -720,14 +1092,20 @@
 
 // Alpha Blend plane and store to destination.
 LIBYUV_API
-int BlendPlane(const uint8* src_y0, int src_stride_y0,
-               const uint8* src_y1, int src_stride_y1,
-               const uint8* alpha, int alpha_stride,
-               uint8* dst_y, int dst_stride_y,
-               int width, int height) {
+int BlendPlane(const uint8_t* src_y0,
+               int src_stride_y0,
+               const uint8_t* src_y1,
+               int src_stride_y1,
+               const uint8_t* alpha,
+               int alpha_stride,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               int width,
+               int height) {
   int y;
-  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
-      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
+  void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1,
+                        const uint8_t* alpha, uint8_t* dst, int width) =
+      BlendPlaneRow_C;
   if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) {
     return -1;
   }
@@ -739,10 +1117,8 @@
   }
 
   // Coalesce rows for Y plane.
-  if (src_stride_y0 == width &&
-      src_stride_y1 == width &&
-      alpha_stride == width &&
-      dst_stride_y == width) {
+  if (src_stride_y0 == width && src_stride_y1 == width &&
+      alpha_stride == width && dst_stride_y == width) {
     width *= height;
     height = 1;
     src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0;
@@ -750,7 +1126,7 @@
 
 #if defined(HAS_BLENDPLANEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-  BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
+    BlendPlaneRow = BlendPlaneRow_Any_SSSE3;
     if (IS_ALIGNED(width, 8)) {
       BlendPlaneRow = BlendPlaneRow_SSSE3;
     }
@@ -758,7 +1134,7 @@
 #endif
 #if defined(HAS_BLENDPLANEROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-  BlendPlaneRow = BlendPlaneRow_Any_AVX2;
+    BlendPlaneRow = BlendPlaneRow_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
       BlendPlaneRow = BlendPlaneRow_AVX2;
     }
@@ -778,24 +1154,36 @@
 #define MAXTWIDTH 2048
 // Alpha Blend YUV images and store to destination.
 LIBYUV_API
-int I420Blend(const uint8* src_y0, int src_stride_y0,
-              const uint8* src_u0, int src_stride_u0,
-              const uint8* src_v0, int src_stride_v0,
-              const uint8* src_y1, int src_stride_y1,
-              const uint8* src_u1, int src_stride_u1,
-              const uint8* src_v1, int src_stride_v1,
-              const uint8* alpha, int alpha_stride,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int width, int height) {
+int I420Blend(const uint8_t* src_y0,
+              int src_stride_y0,
+              const uint8_t* src_u0,
+              int src_stride_u0,
+              const uint8_t* src_v0,
+              int src_stride_v0,
+              const uint8_t* src_y1,
+              int src_stride_y1,
+              const uint8_t* src_u1,
+              int src_stride_u1,
+              const uint8_t* src_v1,
+              int src_stride_v1,
+              const uint8_t* alpha,
+              int alpha_stride,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int width,
+              int height) {
   int y;
   // Half width/height for UV.
   int halfwidth = (width + 1) >> 1;
-  void (*BlendPlaneRow)(const uint8* src0, const uint8* src1,
-      const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C;
-  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
+  void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1,
+                        const uint8_t* alpha, uint8_t* dst, int width) =
+      BlendPlaneRow_C;
+  void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                        uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C;
   if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 ||
       !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
@@ -809,11 +1197,8 @@
   }
 
   // Blend Y plane.
-  BlendPlane(src_y0, src_stride_y0,
-             src_y1, src_stride_y1,
-             alpha, alpha_stride,
-             dst_y, dst_stride_y,
-             width, height);
+  BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride,
+             dst_y, dst_stride_y, width, height);
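
Worth noting before the chroma half of I420Blend: I420 subsamples U and V by
two in both directions, so the chroma planes are halfwidth by halfheight with
the rounding-up division (width + 1) >> 1, and the full-resolution alpha
plane must be box-averaged down (the ScaleRowDown2Box default above) before
each U/V row is blended. A tiny worked example of the size math:

    #include <stdio.h>

    int main(void) {
      /* I420 chroma is subsampled 2x2; sizes round up for odd planes. */
      int width = 7, height = 5;
      int halfwidth = (width + 1) >> 1;   /* 4, not 3 */
      int halfheight = (height + 1) >> 1; /* 3 */
      printf("Y: %dx%d  U,V: %dx%d\n", width, height, halfwidth, halfheight);
      return 0;
    }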
 
 #if defined(HAS_BLENDPLANEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
@@ -893,13 +1278,17 @@
 
 // Multiply 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int ARGBMultiply(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst,
-                          int width) = ARGBMultiplyRow_C;
+  void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1,
+                          uint8_t* dst, int width) = ARGBMultiplyRow_C;
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -910,8 +1299,7 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -941,6 +1329,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBMULTIPLYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBMultiplyRow = ARGBMultiplyRow_MSA;
+    }
+  }
+#endif
 
   // Multiply plane
   for (y = 0; y < height; ++y) {
@@ -954,12 +1350,16 @@
 
 // Add 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
-            const uint8* src_argb1, int src_stride_argb1,
-            uint8* dst_argb, int dst_stride_argb,
-            int width, int height) {
+int ARGBAdd(const uint8_t* src_argb0,
+            int src_stride_argb0,
+            const uint8_t* src_argb1,
+            int src_stride_argb1,
+            uint8_t* dst_argb,
+            int dst_stride_argb,
+            int width,
+            int height) {
   int y;
-  void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
+  void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst,
                      int width) = ARGBAddRow_C;
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -971,8 +1371,7 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -1007,6 +1406,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBADDROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAddRow = ARGBAddRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAddRow = ARGBAddRow_MSA;
+    }
+  }
+#endif
 
   // Add plane
   for (y = 0; y < height; ++y) {
@@ -1020,13 +1427,17 @@
 
 // Subtract 2 ARGB images and store to destination.
 LIBYUV_API
-int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0,
-                 const uint8* src_argb1, int src_stride_argb1,
-                 uint8* dst_argb, int dst_stride_argb,
-                 int width, int height) {
+int ARGBSubtract(const uint8_t* src_argb0,
+                 int src_stride_argb0,
+                 const uint8_t* src_argb1,
+                 int src_stride_argb1,
+                 uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst,
-                          int width) = ARGBSubtractRow_C;
+  void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1,
+                          uint8_t* dst, int width) = ARGBSubtractRow_C;
   if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1037,8 +1448,7 @@
     dst_stride_argb = -dst_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb0 == width * 4 &&
-      src_stride_argb1 == width * 4 &&
+  if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 &&
       dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
@@ -1068,6 +1478,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBSUBTRACTROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBSubtractRow = ARGBSubtractRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBSubtractRow = ARGBSubtractRow_MSA;
+    }
+  }
+#endif
 
   // Subtract plane
   for (y = 0; y < height; ++y) {
@@ -1079,21 +1497,23 @@
   return 0;
 }
 // Convert I422 to RGBA with matrix
-static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y,
-                            const uint8* src_u, int src_stride_u,
-                            const uint8* src_v, int src_stride_v,
-                            uint8* dst_rgba, int dst_stride_rgba,
+static int I422ToRGBAMatrix(const uint8_t* src_y,
+                            int src_stride_y,
+                            const uint8_t* src_u,
+                            int src_stride_u,
+                            const uint8_t* src_v,
+                            int src_stride_v,
+                            uint8_t* dst_rgba,
+                            int dst_stride_rgba,
                             const struct YuvConstants* yuvconstants,
-                            int width, int height) {
+                            int width,
+                            int height) {
   int y;
-  void (*I422ToRGBARow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        const struct YuvConstants* yuvconstants,
-                        int width) = I422ToRGBARow_C;
-  if (!src_y || !src_u || !src_v || !dst_rgba ||
-      width <= 0 || height == 0) {
+  void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf,
+                        const struct YuvConstants* yuvconstants, int width) =
+      I422ToRGBARow_C;
+  if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
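
That comment marks a convention every function in this file shares: a
negative height asks for a vertically flipped result, implemented by pointing
the source at its last row and negating the stride, so the row loop walks
bottom-up with no further special casing. A self-contained sketch
(hypothetical copy helper, not libyuv's CopyPlane):

    #include <stdint.h>
    #include <string.h>

    void CopyPlaneSketch(const uint8_t* src, int src_stride, uint8_t* dst,
                         int dst_stride, int width, int height) {
      int y;
      if (height < 0) {
        height = -height;
        src = src + (height - 1) * src_stride; /* start at the last row */
        src_stride = -src_stride;              /* then walk upward */
      }
      for (y = 0; y < height; ++y) {
        memcpy(dst, src, (size_t)width);
        src += src_stride; /* a negative stride steps toward row 0 */
        dst += dst_stride;
      }
    }
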
@@ -1126,13 +1546,12 @@
     }
   }
 #endif
-#if defined(HAS_I422TORGBAROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) {
-    I422ToRGBARow = I422ToRGBARow_DSPR2;
+#if defined(HAS_I422TORGBAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToRGBARow = I422ToRGBARow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      I422ToRGBARow = I422ToRGBARow_MSA;
+    }
   }
 #endif
 
@@ -1148,30 +1567,36 @@
 
 // Convert I422 to RGBA.
 LIBYUV_API
-int I422ToRGBA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_rgba, int dst_stride_rgba,
-               int width, int height) {
-  return I422ToRGBAMatrix(src_y, src_stride_y,
-                          src_u, src_stride_u,
-                          src_v, src_stride_v,
-                          dst_rgba, dst_stride_rgba,
-                          &kYuvI601Constants,
-                          width, height);
+int I422ToRGBA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_rgba,
+               int dst_stride_rgba,
+               int width,
+               int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                          src_stride_v, dst_rgba, dst_stride_rgba,
+                          &kYuvI601Constants, width, height);
 }
 
 // Convert I422 to BGRA.
 LIBYUV_API
-int I422ToBGRA(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_bgra, int dst_stride_bgra,
-               int width, int height) {
-  return I422ToRGBAMatrix(src_y, src_stride_y,
-                          src_v, src_stride_v,  // Swap U and V
-                          src_u, src_stride_u,
-                          dst_bgra, dst_stride_bgra,
+int I422ToBGRA(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_bgra,
+               int dst_stride_bgra,
+               int width,
+               int height) {
+  return I422ToRGBAMatrix(src_y, src_stride_y, src_v,
+                          src_stride_v,  // Swap U and V
+                          src_u, src_stride_u, dst_bgra, dst_stride_bgra,
                           &kYvuI601Constants,  // Use Yvu matrix
                           width, height);
 }
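
The BGRA conversion above is a nice trick: there is no dedicated BGRA path.
Because red is driven by V and blue by U, feeding the RGBA converter swapped
chroma planes together with the coefficient-swapped kYvu table makes its red
slot compute blue and vice versa. A toy float model of the identity (libyuv's
constants are fixed point, and green, which mixes both chroma terms, is
omitted for brevity):

    /* Simplified "matrix": just the two chroma gains. */
    typedef struct { float rv, bu; } Gains;

    static const Gains kYuvToy = {1.402f, 1.772f}; /* R from V, B from U */
    static const Gains kYvuToy = {1.772f, 1.402f}; /* same gains, swapped */

    /* RGBA-order converter: out[0] is the red slot, out[2] the blue. */
    static void ToRGBA(float y, float u, float v, const Gains* g,
                       float out[3]) {
      out[0] = y + g->rv * v;
      out[1] = y; /* green elided */
      out[2] = y + g->bu * u;
    }

    int main(void) {
      float y = 0.5f, u = 0.1f, v = -0.2f, rgba[3], bgra[3];
      ToRGBA(y, u, v, &kYuvToy, rgba); /* normal RGBA */
      ToRGBA(y, v, u, &kYvuToy, bgra); /* swapped chroma + Yvu table */
      /* Now bgra[0] == rgba[2] and bgra[2] == rgba[0]: R and B traded. */
      return 0;
    }
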
@@ -1178,18 +1603,19 @@
 
 // Convert NV12 to RGB565.
 LIBYUV_API
-int NV12ToRGB565(const uint8* src_y, int src_stride_y,
-                 const uint8* src_uv, int src_stride_uv,
-                 uint8* dst_rgb565, int dst_stride_rgb565,
-                 int width, int height) {
+int NV12ToRGB565(const uint8_t* src_y,
+                 int src_stride_y,
+                 const uint8_t* src_uv,
+                 int src_stride_uv,
+                 uint8_t* dst_rgb565,
+                 int dst_stride_rgb565,
+                 int width,
+                 int height) {
   int y;
-  void (*NV12ToRGB565Row)(const uint8* y_buf,
-                          const uint8* uv_buf,
-                          uint8* rgb_buf,
-                          const struct YuvConstants* yuvconstants,
-                          int width) = NV12ToRGB565Row_C;
-  if (!src_y || !src_uv || !dst_rgb565 ||
-      width <= 0 || height == 0) {
+  void (*NV12ToRGB565Row)(
+      const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf,
+      const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C;
+  if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1222,6 +1648,14 @@
     }
   }
 #endif
+#if defined(HAS_NV12TORGB565ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      NV12ToRGB565Row = NV12ToRGB565Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width);
@@ -1236,14 +1670,16 @@
 
 // Convert RAW to RGB24.
 LIBYUV_API
-int RAWToRGB24(const uint8* src_raw, int src_stride_raw,
-               uint8* dst_rgb24, int dst_stride_rgb24,
-               int width, int height) {
+int RAWToRGB24(const uint8_t* src_raw,
+               int src_stride_raw,
+               uint8_t* dst_rgb24,
+               int dst_stride_rgb24,
+               int width,
+               int height) {
   int y;
-  void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) =
+  void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) =
       RAWToRGB24Row_C;
-  if (!src_raw || !dst_rgb24 ||
-      width <= 0 || height == 0) {
+  if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -1253,8 +1689,7 @@
     src_stride_raw = -src_stride_raw;
   }
   // Coalesce rows.
-  if (src_stride_raw == width * 3 &&
-      dst_stride_rgb24 == width * 3) {
+  if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) {
     width *= height;
     height = 1;
     src_stride_raw = dst_stride_rgb24 = 0;
@@ -1275,6 +1710,14 @@
     }
   }
 #endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    RAWToRGB24Row = RAWToRGB24Row_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      RAWToRGB24Row = RAWToRGB24Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     RAWToRGB24Row(src_raw, dst_rgb24, width);
@@ -1285,11 +1728,13 @@
 }
 
 LIBYUV_API
-void SetPlane(uint8* dst_y, int dst_stride_y,
-              int width, int height,
-              uint32 value) {
+void SetPlane(uint8_t* dst_y,
+              int dst_stride_y,
+              int width,
+              int height,
+              uint32_t value) {
   int y;
-  void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C;
+  void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C;
   if (height < 0) {
     height = -height;
     dst_y = dst_y + (height - 1) * dst_stride_y;
@@ -1322,6 +1767,11 @@
     SetRow = SetRow_ERMS;
   }
 #endif
+#if defined(HAS_SETROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) {
+    SetRow = SetRow_MSA;
+  }
+#endif
 
   // Set plane
   for (y = 0; y < height; ++y) {
@@ -1332,22 +1782,26 @@
 
 // Draw a rectangle into I420
 LIBYUV_API
-int I420Rect(uint8* dst_y, int dst_stride_y,
-             uint8* dst_u, int dst_stride_u,
-             uint8* dst_v, int dst_stride_v,
-             int x, int y,
-             int width, int height,
-             int value_y, int value_u, int value_v) {
+int I420Rect(uint8_t* dst_y,
+             int dst_stride_y,
+             uint8_t* dst_u,
+             int dst_stride_u,
+             uint8_t* dst_v,
+             int dst_stride_v,
+             int x,
+             int y,
+             int width,
+             int height,
+             int value_y,
+             int value_u,
+             int value_v) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  uint8* start_y = dst_y + y * dst_stride_y + x;
-  uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
-  uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
-  if (!dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0 ||
-      x < 0 || y < 0 ||
-      value_y < 0 || value_y > 255 ||
-      value_u < 0 || value_u > 255 ||
+  uint8_t* start_y = dst_y + y * dst_stride_y + x;
+  uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2);
+  uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2);
+  if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 ||
+      y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 ||
       value_v < 0 || value_v > 255) {
     return -1;
   }
@@ -1360,15 +1814,17 @@
 
 // Draw a rectangle into ARGB
 LIBYUV_API
-int ARGBRect(uint8* dst_argb, int dst_stride_argb,
-             int dst_x, int dst_y,
-             int width, int height,
-             uint32 value) {
+int ARGBRect(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height,
+             uint32_t value) {
   int y;
-  void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C;
-  if (!dst_argb ||
-      width <= 0 || height == 0 ||
-      dst_x < 0 || dst_y < 0) {
+  void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) =
+      ARGBSetRow_C;
+  if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
   if (height < 0) {
@@ -1397,6 +1853,14 @@
     ARGBSetRow = ARGBSetRow_X86;
   }
 #endif
+#if defined(HAS_ARGBSETROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBSetRow = ARGBSetRow_Any_MSA;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBSetRow = ARGBSetRow_MSA;
+    }
+  }
+#endif
 
   // Set plane
   for (y = 0; y < height; ++y) {
@@ -1420,11 +1884,14 @@
 //   f is foreground pixel premultiplied by alpha
 
 LIBYUV_API
-int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height) {
+int ARGBAttenuate(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
   int y;
-  void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
+  void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                            int width) = ARGBAttenuateRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
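
The attenuate comment block above ("f is foreground pixel premultiplied by
alpha") is the key to this whole family: ARGBAttenuate multiplies each color
channel by alpha/255 so that later blends collapse to an add, and
ARGBUnattenuate divides it back out. A scalar sketch of one pixel, assuming
libyuv's little-endian ARGB byte order (B, G, R, A); the production rows use
slightly different fixed-point rounding:

    #include <stdint.h>

    void AttenuatePixelSketch(const uint8_t src[4], uint8_t dst[4]) {
      uint32_t a = src[3];
      dst[0] = (uint8_t)(src[0] * a / 255); /* b */
      dst[1] = (uint8_t)(src[1] * a / 255); /* g */
      dst[2] = (uint8_t)(src[2] * a / 255); /* r */
      dst[3] = (uint8_t)a;                  /* alpha itself is unchanged */
    }
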
@@ -1435,8 +1902,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1465,6 +1931,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBATTENUATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBAttenuateRow(src_argb, dst_argb, width);
@@ -1476,11 +1950,14 @@
 
 // Convert preattenuated ARGB to unattenuated ARGB.
 LIBYUV_API
-int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height) {
+int ARGBUnattenuate(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height) {
   int y;
-  void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb,
+  void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb,
                              int width) = ARGBUnattenuateRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -1491,8 +1968,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1513,7 +1989,7 @@
     }
   }
 #endif
-// TODO(fbarchard): Neon version.
+  // TODO(fbarchard): Neon version.
 
   for (y = 0; y < height; ++y) {
     ARGBUnattenuateRow(src_argb, dst_argb, width);
@@ -1525,12 +2001,15 @@
 
 // Convert ARGB to Grayed ARGB.
 LIBYUV_API
-int ARGBGrayTo(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height) {
+int ARGBGrayTo(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height) {
   int y;
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
+  void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      ARGBGrayRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1540,8 +2019,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1556,6 +2034,11 @@
     ARGBGrayRow = ARGBGrayRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_MSA;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBGrayRow(src_argb, dst_argb, width);
@@ -1567,13 +2050,16 @@
 
 // Make a rectangle of ARGB gray scale.
 LIBYUV_API
-int ARGBGray(uint8* dst_argb, int dst_stride_argb,
-             int dst_x, int dst_y,
-             int width, int height) {
+int ARGBGray(uint8_t* dst_argb,
+             int dst_stride_argb,
+             int dst_x,
+             int dst_y,
+             int width,
+             int height) {
   int y;
-  void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb,
-                      int width) = ARGBGrayRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      ARGBGrayRow_C;
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
@@ -1593,6 +2079,12 @@
     ARGBGrayRow = ARGBGrayRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBGRAYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBGrayRow = ARGBGrayRow_MSA;
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBGrayRow(dst, dst, width);
     dst += dst_stride_argb;
@@ -1602,11 +2094,15 @@
 
 // Make a rectangle of ARGB Sepia tone.
 LIBYUV_API
-int ARGBSepia(uint8* dst_argb, int dst_stride_argb,
-              int dst_x, int dst_y, int width, int height) {
+int ARGBSepia(uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_x,
+              int dst_y,
+              int width,
+              int height) {
   int y;
-  void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C;
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) {
     return -1;
   }
@@ -1626,6 +2122,12 @@
     ARGBSepiaRow = ARGBSepiaRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBSEPIAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBSepiaRow = ARGBSepiaRow_MSA;
+  }
+#endif
+
   for (y = 0; y < height; ++y) {
     ARGBSepiaRow(dst, width);
     dst += dst_stride_argb;
@@ -1636,13 +2138,17 @@
 // Apply a 4x4 matrix to each ARGB pixel.
 // Note: Normally for shading, but can be used to swizzle or invert.
 LIBYUV_API
-int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb,
-                    uint8* dst_argb, int dst_stride_argb,
-                    const int8* matrix_argb,
-                    int width, int height) {
+int ARGBColorMatrix(const uint8_t* src_argb,
+                    int src_stride_argb,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    const int8_t* matrix_argb,
+                    int width,
+                    int height) {
   int y;
-  void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb,
-      const int8* matrix_argb, int width) = ARGBColorMatrixRow_C;
+  void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                             const int8_t* matrix_argb, int width) =
+      ARGBColorMatrixRow_C;
   if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -1652,8 +2158,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1668,6 +2173,11 @@
     ARGBColorMatrixRow = ARGBColorMatrixRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBCOLORMATRIXROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBColorMatrixRow = ARGBColorMatrixRow_MSA;
+  }
+#endif
   for (y = 0; y < height; ++y) {
     ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width);
     src_argb += src_stride_argb;
@@ -1679,13 +2189,17 @@
 // Apply a 4x3 matrix to each ARGB pixel.
 // Deprecated.
 LIBYUV_API
-int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb,
-                   const int8* matrix_rgb,
-                   int dst_x, int dst_y, int width, int height) {
-  SIMD_ALIGNED(int8 matrix_argb[16]);
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+int RGBColorMatrix(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const int8_t* matrix_rgb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height) {
+  SIMD_ALIGNED(int8_t matrix_argb[16]);
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
 
@@ -1705,23 +2219,26 @@
   matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0;
   matrix_argb[15] = 64;  // 1.0
 
-  return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb,
-                         dst, dst_stride_argb,
-                         &matrix_argb[0], width, height);
+  return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst,
+                         dst_stride_argb, &matrix_argb[0], width, height);
 }
 
 // Apply a color table to each ARGB pixel.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int ARGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                   const uint8* table_argb,
-                   int dst_x, int dst_y, int width, int height) {
+int ARGBColorTable(uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   const uint8_t* table_argb,
+                   int dst_x,
+                   int dst_y,
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+  void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
                             int width) = ARGBColorTableRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
   // Coalesce rows.
@@ -1745,15 +2262,19 @@
 // Apply a color table to each ARGB pixel but preserve destination alpha.
 // Table contains 256 ARGB values.
 LIBYUV_API
-int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
-                  const uint8* table_argb,
-                  int dst_x, int dst_y, int width, int height) {
+int RGBColorTable(uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  const uint8_t* table_argb,
+                  int dst_x,
+                  int dst_y,
+                  int width,
+                  int height) {
   int y;
-  void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb,
+  void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb,
                            int width) = RGBColorTableRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
-  if (!dst_argb || !table_argb || width <= 0 || height <= 0 ||
-      dst_x < 0 || dst_y < 0) {
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 ||
+      dst_y < 0) {
     return -1;
   }
   // Coalesce rows.
@@ -1784,13 +2305,19 @@
 // Caveat - although SSE2 saturates, the C function does not and should be used
 // with care if doing anything but quantization.
 LIBYUV_API
-int ARGBQuantize(uint8* dst_argb, int dst_stride_argb,
-                 int scale, int interval_size, int interval_offset,
-                 int dst_x, int dst_y, int width, int height) {
+int ARGBQuantize(uint8_t* dst_argb,
+                 int dst_stride_argb,
+                 int scale,
+                 int interval_size,
+                 int interval_offset,
+                 int dst_x,
+                 int dst_y,
+                 int width,
+                 int height) {
   int y;
-  void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size,
+  void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size,
                           int interval_offset, int width) = ARGBQuantizeRow_C;
-  uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
+  uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
   if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 ||
       interval_size < 1 || interval_size > 255) {
     return -1;
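
ARGBQuantize's three knobs read most naturally as a fixed-point posterize:
scale is typically 65536 / interval_size (a 16.16 reciprocal), so each
channel is mapped to an interval index and re-expanded. A hedged scalar
reading of one channel (matching the parameters, not necessarily the row
kernels bit for bit; note the caveat above that the C row does not saturate):

    #include <stdint.h>

    uint8_t QuantizeChannelSketch(uint8_t v, int scale, int interval_size,
                                  int interval_offset) {
      int idx = (v * scale) >> 16; /* which interval v falls in */
      int out = idx * interval_size + interval_offset;
      return (uint8_t)(out > 255 ? 255 : out); /* clamp added for safety */
    }
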
@@ -1811,6 +2338,11 @@
     ARGBQuantizeRow = ARGBQuantizeRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBQUANTIZEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) {
+    ARGBQuantizeRow = ARGBQuantizeRow_MSA;
+  }
+#endif
   for (y = 0; y < height; ++y) {
     ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width);
     dst += dst_stride_argb;
@@ -1821,13 +2353,17 @@
 // Computes a table of cumulative sums for an image, where each entry is the
 // sum of all values above and to the left of it. Used by ARGBBlur.
 LIBYUV_API
-int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb,
-                             int32* dst_cumsum, int dst_stride32_cumsum,
-                             int width, int height) {
+int ARGBComputeCumulativeSum(const uint8_t* src_argb,
+                             int src_stride_argb,
+                             int32_t* dst_cumsum,
+                             int dst_stride32_cumsum,
+                             int width,
+                             int height) {
   int y;
-  void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
-  int32* previous_cumsum = dst_cumsum;
+  void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum,
+                                  const int32_t* previous_cumsum, int width) =
+      ComputeCumulativeSumRow_C;
+  int32_t* previous_cumsum = dst_cumsum;
   if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) {
     return -1;
   }
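
The cumulative-sum table is a classic summed-area table: every entry holds
the sum of all samples above and to the left, so the sum over any
axis-aligned box falls out of four lookups. That is what lets ARGBBlur
average a (2 * radius + 1) box in constant time per pixel, independent of
radius. A one-channel sketch of both halves:

    #include <stdint.h>

    /* Build sat[y][x] = sum of src over rows 0..y, columns 0..x. */
    void BuildSAT(const uint8_t* src, int stride, int32_t* sat, int width,
                  int height) {
      int x, y;
      for (y = 0; y < height; ++y) {
        int32_t row = 0;
        for (x = 0; x < width; ++x) {
          row += src[y * stride + x];
          sat[y * width + x] = row + (y ? sat[(y - 1) * width + x] : 0);
        }
      }
    }

    /* Sum of the box (x0,y0)..(x1,y1), inclusive, via four corners. */
    int32_t BoxSum(const int32_t* sat, int width, int x0, int y0, int x1,
                   int y1) {
      int32_t s = sat[y1 * width + x1];
      if (x0 > 0) s -= sat[y1 * width + (x0 - 1)];
      if (y0 > 0) s -= sat[(y0 - 1) * width + x1];
      if (x0 > 0 && y0 > 0) s += sat[(y0 - 1) * width + (x0 - 1)];
      return s;
    }
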
@@ -1851,18 +2387,25 @@
 // aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory
 // as the buffer is treated as circular.
 LIBYUV_API
-int ARGBBlur(const uint8* src_argb, int src_stride_argb,
-             uint8* dst_argb, int dst_stride_argb,
-             int32* dst_cumsum, int dst_stride32_cumsum,
-             int width, int height, int radius) {
+int ARGBBlur(const uint8_t* src_argb,
+             int src_stride_argb,
+             uint8_t* dst_argb,
+             int dst_stride_argb,
+             int32_t* dst_cumsum,
+             int dst_stride32_cumsum,
+             int width,
+             int height,
+             int radius) {
   int y;
-  void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum,
-      const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C;
-  void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft,
-      int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C;
-  int32* cumsum_bot_row;
-  int32* max_cumsum_bot_row;
-  int32* cumsum_top_row;
+  void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum,
+                                  const int32_t* previous_cumsum, int width) =
+      ComputeCumulativeSumRow_C;
+  void (*CumulativeSumToAverageRow)(
+      const int32_t* topleft, const int32_t* botleft, int width, int area,
+      uint8_t* dst, int count) = CumulativeSumToAverageRow_C;
+  int32_t* cumsum_bot_row;
+  int32_t* max_cumsum_bot_row;
+  int32_t* cumsum_top_row;
 
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
@@ -1889,9 +2432,8 @@
 #endif
   // Compute enough CumulativeSum for first row to be blurred. After this
   // one row of CumulativeSum is updated at a time.
-  ARGBComputeCumulativeSum(src_argb, src_stride_argb,
-                           dst_cumsum, dst_stride32_cumsum,
-                           width, radius);
+  ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum,
+                           dst_stride32_cumsum, width, radius);
 
   src_argb = src_argb + radius * src_stride_argb;
   cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum];
@@ -1917,7 +2459,7 @@
     // Increment cumsum_bot_row pointer with circular buffer wrap around and
     // then fill in a row of CumulativeSum.
     if ((y + radius) < height) {
-      const int32* prev_cumsum_bot_row = cumsum_bot_row;
+      const int32_t* prev_cumsum_bot_row = cumsum_bot_row;
       cumsum_bot_row += dst_stride32_cumsum;
       if (cumsum_bot_row >= max_cumsum_bot_row) {
         cumsum_bot_row = dst_cumsum;
@@ -1929,8 +2471,8 @@
 
     // Left clipped.
     for (x = 0; x < radius + 1; ++x) {
-      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
-                                boxwidth, area, &dst_argb[x * 4], 1);
+      CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+                                &dst_argb[x * 4], 1);
       area += (bot_y - top_y);
       boxwidth += 4;
     }
@@ -1937,8 +2479,8 @@
 
     // Middle unclipped.
     n = (width - 1) - radius - x + 1;
-    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row,
-                              boxwidth, area, &dst_argb[x * 4], n);
+    CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area,
+                              &dst_argb[x * 4], n);
 
     // Right clipped.
     for (x += n; x <= width - 1; ++x) {
@@ -1945,8 +2487,8 @@
       area -= (bot_y - top_y);
       boxwidth -= 4;
       CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4,
-                                cumsum_bot_row + (x - radius - 1) * 4,
-                                boxwidth, area, &dst_argb[x * 4], 1);
+                                cumsum_bot_row + (x - radius - 1) * 4, boxwidth,
+                                area, &dst_argb[x * 4], 1);
     }
     dst_argb += dst_stride_argb;
   }
@@ -1955,12 +2497,16 @@
 
 // Multiply ARGB image by a specified ARGB value.
 LIBYUV_API
-int ARGBShade(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height, uint32 value) {
+int ARGBShade(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height,
+              uint32_t value) {
   int y;
-  void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb,
-                       int width, uint32 value) = ARGBShadeRow_C;
+  void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width,
+                       uint32_t value) = ARGBShadeRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) {
     return -1;
   }
@@ -1970,8 +2516,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -1986,6 +2531,11 @@
     ARGBShadeRow = ARGBShadeRow_NEON;
   }
 #endif
+#if defined(HAS_ARGBSHADEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) {
+    ARGBShadeRow = ARGBShadeRow_MSA;
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     ARGBShadeRow(src_argb, dst_argb, width, value);
@@ -1997,12 +2547,17 @@
 
 // Interpolate 2 planes by specified amount (0 to 255).
 LIBYUV_API
-int InterpolatePlane(const uint8* src0, int src_stride0,
-                     const uint8* src1, int src_stride1,
-                     uint8* dst, int dst_stride,
-                     int width, int height, int interpolation) {
+int InterpolatePlane(const uint8_t* src0,
+                     int src_stride0,
+                     const uint8_t* src1,
+                     int src_stride1,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width,
+                     int height,
+                     int interpolation) {
   int y;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
   if (!src0 || !src1 || !dst || width <= 0 || height == 0) {
@@ -2015,9 +2570,7 @@
     dst_stride = -dst_stride;
   }
   // Coalesce rows.
-  if (src_stride0 == width &&
-      src_stride1 == width &&
-      dst_stride == width) {
+  if (src_stride0 == width && src_stride1 == width && dst_stride == width) {
     width *= height;
     height = 1;
     src_stride0 = src_stride1 = dst_stride = 0;
@@ -2046,13 +2599,12 @@
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) &&
-      IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) &&
-      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) &&
-      IS_ALIGNED(width, 4)) {
-    InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
   }
 #endif
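
InterpolatePlane's last parameter is an 8-bit blend fraction: 0 returns src0,
128 is an even average, and values near 255 approach src1. The row kernels
are a fixed-point lerp along these lines (a sketch; the real rows
special-case 0 and 128 and may round differently):

    #include <stdint.h>

    void InterpolateRowSketch(uint8_t* dst, const uint8_t* src0,
                              const uint8_t* src1, int width, int fraction) {
      int f1 = fraction; /* weight of src1, 0..256 */
      int f0 = 256 - f1; /* weight of src0 */
      int x;
      for (x = 0; x < width; ++x) {
        dst[x] = (uint8_t)((src0[x] * f0 + src1[x] * f1 + 128) >> 8);
      }
    }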
 
@@ -2067,61 +2619,71 @@
 
 // Interpolate 2 ARGB images by specified amount (0 to 255).
 LIBYUV_API
-int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
-                    const uint8* src_argb1, int src_stride_argb1,
-                    uint8* dst_argb, int dst_stride_argb,
-                    int width, int height, int interpolation) {
-  return InterpolatePlane(src_argb0, src_stride_argb0,
-                          src_argb1, src_stride_argb1,
-                          dst_argb, dst_stride_argb,
+int ARGBInterpolate(const uint8_t* src_argb0,
+                    int src_stride_argb0,
+                    const uint8_t* src_argb1,
+                    int src_stride_argb1,
+                    uint8_t* dst_argb,
+                    int dst_stride_argb,
+                    int width,
+                    int height,
+                    int interpolation) {
+  return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1,
+                          src_stride_argb1, dst_argb, dst_stride_argb,
                           width * 4, height, interpolation);
 }
 
 // Interpolate 2 YUV images by specified amount (0 to 255).
 LIBYUV_API
-int I420Interpolate(const uint8* src0_y, int src0_stride_y,
-                    const uint8* src0_u, int src0_stride_u,
-                    const uint8* src0_v, int src0_stride_v,
-                    const uint8* src1_y, int src1_stride_y,
-                    const uint8* src1_u, int src1_stride_u,
-                    const uint8* src1_v, int src1_stride_v,
-                    uint8* dst_y, int dst_stride_y,
-                    uint8* dst_u, int dst_stride_u,
-                    uint8* dst_v, int dst_stride_v,
-                    int width, int height, int interpolation) {
+int I420Interpolate(const uint8_t* src0_y,
+                    int src0_stride_y,
+                    const uint8_t* src0_u,
+                    int src0_stride_u,
+                    const uint8_t* src0_v,
+                    int src0_stride_v,
+                    const uint8_t* src1_y,
+                    int src1_stride_y,
+                    const uint8_t* src1_u,
+                    int src1_stride_u,
+                    const uint8_t* src1_v,
+                    int src1_stride_v,
+                    uint8_t* dst_y,
+                    int dst_stride_y,
+                    uint8_t* dst_u,
+                    int dst_stride_u,
+                    uint8_t* dst_v,
+                    int dst_stride_v,
+                    int width,
+                    int height,
+                    int interpolation) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src0_y || !src0_u || !src0_v ||
-      !src1_y || !src1_u || !src1_v ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
+  if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v ||
+      !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
     return -1;
   }
-  InterpolatePlane(src0_y, src0_stride_y,
-                   src1_y, src1_stride_y,
-                   dst_y, dst_stride_y,
-                   width, height, interpolation);
-  InterpolatePlane(src0_u, src0_stride_u,
-                   src1_u, src1_stride_u,
-                   dst_u, dst_stride_u,
-                   halfwidth, halfheight, interpolation);
-  InterpolatePlane(src0_v, src0_stride_v,
-                   src1_v, src1_stride_v,
-                   dst_v, dst_stride_v,
-                   halfwidth, halfheight, interpolation);
+  InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y,
+                   dst_stride_y, width, height, interpolation);
+  InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u,
+                   dst_stride_u, halfwidth, halfheight, interpolation);
+  InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v,
+                   dst_stride_v, halfwidth, halfheight, interpolation);
   return 0;
 }
 
 // Shuffle ARGB channel order.  e.g. BGRA to ARGB.
 LIBYUV_API
-int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra,
-                uint8* dst_argb, int dst_stride_argb,
-                const uint8* shuffler, int width, int height) {
+int ARGBShuffle(const uint8_t* src_bgra,
+                int src_stride_bgra,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                const uint8_t* shuffler,
+                int width,
+                int height) {
   int y;
-  void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb,
-                         const uint8* shuffler, int width) = ARGBShuffleRow_C;
-  if (!src_bgra || !dst_argb ||
-      width <= 0 || height == 0) {
+  void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb,
+                         const uint8_t* shuffler, int width) = ARGBShuffleRow_C;
+  if (!src_bgra || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -2131,20 +2693,11 @@
     src_stride_bgra = -src_stride_bgra;
   }
   // Coalesce rows.
-  if (src_stride_bgra == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_bgra = dst_stride_argb = 0;
   }
-#if defined(HAS_ARGBSHUFFLEROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBShuffleRow = ARGBShuffleRow_Any_SSE2;
-    if (IS_ALIGNED(width, 4)) {
-      ARGBShuffleRow = ARGBShuffleRow_SSE2;
-    }
-  }
-#endif
 #if defined(HAS_ARGBSHUFFLEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3;
@@ -2169,6 +2722,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBSHUFFLEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBShuffleRow = ARGBShuffleRow_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBShuffleRow = ARGBShuffleRow_MSA;
+    }
+  }
+#endif
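
The shuffler argument is a byte-index map: entry i says which byte of each
source pixel lands in byte i of the destination, which is how one routine
covers BGRA-to-ARGB and every other 32-bit channel reorder. Conceptually four
indices per pixel (the SIMD rows consume a wider mask built from the same
pattern). A C sketch:

    #include <stdint.h>

    void ShuffleRowSketch(const uint8_t* src, uint8_t* dst,
                          const uint8_t shuffler[4], int width) {
      int x, i;
      for (x = 0; x < width; ++x) {
        for (i = 0; i < 4; ++i) {
          dst[x * 4 + i] = src[x * 4 + shuffler[i]];
        }
      }
    }

    /* Example map: reverse the bytes of every pixel (BGRA <-> ARGB). */
    static const uint8_t kShuffleReverse[4] = {3u, 2u, 1u, 0u};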
 
   for (y = 0; y < height; ++y) {
     ARGBShuffleRow(src_bgra, dst_argb, shuffler, width);
@@ -2179,28 +2740,32 @@
 }
 
 // Sobel ARGB effect.
-static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
-                        uint8* dst_argb, int dst_stride_argb,
-                        int width, int height,
-                        void (*SobelRow)(const uint8* src_sobelx,
-                                         const uint8* src_sobely,
-                                         uint8* dst, int width)) {
+static int ARGBSobelize(const uint8_t* src_argb,
+                        int src_stride_argb,
+                        uint8_t* dst_argb,
+                        int dst_stride_argb,
+                        int width,
+                        int height,
+                        void (*SobelRow)(const uint8_t* src_sobelx,
+                                         const uint8_t* src_sobely,
+                                         uint8_t* dst,
+                                         int width)) {
   int y;
-  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) =
+  void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) =
       ARGBToYJRow_C;
-  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) = SobelYRow_C;
-  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobely, int width) =
+  void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1,
+                    uint8_t* dst_sobely, int width) = SobelYRow_C;
+  void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1,
+                    const uint8_t* src_y2, uint8_t* dst_sobely, int width) =
       SobelXRow_C;
   const int kEdge = 16;  // Extra pixels at start of row for extrude/align.
-  if (!src_argb  || !dst_argb || width <= 0 || height == 0) {
+  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
 
@@ -2228,6 +2793,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBTOYJROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBToYJRow = ARGBToYJRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYJRow = ARGBToYJRow_MSA;
+    }
+  }
+#endif
 
 #if defined(HAS_SOBELYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
@@ -2239,6 +2812,11 @@
     SobelYRow = SobelYRow_NEON;
   }
 #endif
+#if defined(HAS_SOBELYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelYRow = SobelYRow_MSA;
+  }
+#endif
 #if defined(HAS_SOBELXROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelXRow = SobelXRow_SSE2;
@@ -2249,18 +2827,23 @@
     SobelXRow = SobelXRow_NEON;
   }
 #endif
+#if defined(HAS_SOBELXROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelXRow = SobelXRow_MSA;
+  }
+#endif
   {
     // 3 rows with edges before/after.
     const int kRowSize = (width + kEdge + 31) & ~31;
     align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge));
-    uint8* row_sobelx = rows;
-    uint8* row_sobely = rows + kRowSize;
-    uint8* row_y = rows + kRowSize * 2;
+    uint8_t* row_sobelx = rows;
+    uint8_t* row_sobely = rows + kRowSize;
+    uint8_t* row_y = rows + kRowSize * 2;
 
     // Convert first row.
-    uint8* row_y0 = row_y + kEdge;
-    uint8* row_y1 = row_y0 + kRowSize;
-    uint8* row_y2 = row_y1 + kRowSize;
+    uint8_t* row_y0 = row_y + kEdge;
+    uint8_t* row_y1 = row_y0 + kRowSize;
+    uint8_t* row_y2 = row_y1 + kRowSize;
     ARGBToYJRow(src_argb, row_y0, width);
     row_y0[-1] = row_y0[0];
     memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
@@ -2284,7 +2867,7 @@
 
       // Cycle thru circular queue of 3 row_y buffers.
       {
-        uint8* row_yt = row_y0;
+        uint8_t* row_yt = row_y0;
         row_y0 = row_y1;
         row_y1 = row_y2;
         row_y2 = row_yt;
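
The three-row circular queue above exists because a Sobel kernel needs the
rows above and below the current one; rotating three pointers avoids copying
a full row per step. For reference, the math the SobelXRow/SobelYRow/SobelRow
pipeline implements, collapsed into one scalar function (sign conventions
vary, and the absolute values make them irrelevant):

    #include <stdint.h>
    #include <stdlib.h>

    /* y0/y1/y2 are the luma rows above, at, and below the pixel;
     * x must be an interior column so x - 1 and x + 1 are valid. */
    uint8_t SobelPixelSketch(const uint8_t* y0, const uint8_t* y1,
                             const uint8_t* y2, int x) {
      int gx = (y0[x - 1] + 2 * y1[x - 1] + y2[x - 1]) -
               (y0[x + 1] + 2 * y1[x + 1] + y2[x + 1]);
      int gy = (y0[x - 1] + 2 * y0[x] + y0[x + 1]) -
               (y2[x - 1] + 2 * y2[x] + y2[x + 1]);
      int g = abs(gx) + abs(gy); /* L1 magnitude, no sqrt needed */
      return (uint8_t)(g > 255 ? 255 : g);
    }
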
@@ -2299,11 +2882,14 @@
 
 // Sobel ARGB effect.
 LIBYUV_API
-int ARGBSobel(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
-  void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) = SobelRow_C;
+int ARGBSobel(const uint8_t* src_argb,
+              int src_stride_argb,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int width,
+              int height) {
+  void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                   uint8_t* dst_argb, int width) = SobelRow_C;
 #if defined(HAS_SOBELROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelRow = SobelRow_Any_SSE2;
@@ -2320,6 +2906,14 @@
     }
   }
 #endif
+#if defined(HAS_SOBELROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelRow = SobelRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      SobelRow = SobelRow_MSA;
+    }
+  }
+#endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height, SobelRow);
 }
@@ -2326,11 +2920,14 @@
 
 // Sobel ARGB effect with planar output.
 LIBYUV_API
-int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb,
-                     uint8* dst_y, int dst_stride_y,
-                     int width, int height) {
-  void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_, int width) = SobelToPlaneRow_C;
+int ARGBSobelToPlane(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     int width,
+                     int height) {
+  void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                          uint8_t* dst_, int width) = SobelToPlaneRow_C;
 #if defined(HAS_SOBELTOPLANEROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelToPlaneRow = SobelToPlaneRow_Any_SSE2;
@@ -2347,18 +2944,29 @@
     }
   }
 #endif
-  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y,
-                      width, height, SobelToPlaneRow);
+#if defined(HAS_SOBELTOPLANEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelToPlaneRow = SobelToPlaneRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SobelToPlaneRow = SobelToPlaneRow_MSA;
+    }
+  }
+#endif
+  return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width,
+                      height, SobelToPlaneRow);
 }
 
 // SobelXY ARGB effect.
 // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B.  G = Sobel.
 LIBYUV_API
-int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
-                uint8* dst_argb, int dst_stride_argb,
-                int width, int height) {
-  void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) = SobelXYRow_C;
+int ARGBSobelXY(const uint8_t* src_argb,
+                int src_stride_argb,
+                uint8_t* dst_argb,
+                int dst_stride_argb,
+                int width,
+                int height) {
+  void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely,
+                     uint8_t* dst_argb, int width) = SobelXYRow_C;
 #if defined(HAS_SOBELXYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     SobelXYRow = SobelXYRow_Any_SSE2;
@@ -2375,6 +2983,14 @@
     }
   }
 #endif
+#if defined(HAS_SOBELXYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SobelXYRow = SobelXYRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      SobelXYRow = SobelXYRow_MSA;
+    }
+  }
+#endif
   return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height, SobelXYRow);
 }
@@ -2381,14 +2997,16 @@
 
 // Apply a 4x4 polynomial to each ARGB pixel.
 LIBYUV_API
-int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
-                   uint8* dst_argb, int dst_stride_argb,
+int ARGBPolynomial(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
                    const float* poly,
-                   int width, int height) {
+                   int width,
+                   int height) {
   int y;
-  void (*ARGBPolynomialRow)(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) = ARGBPolynomialRow_C;
+  void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                            const float* poly, int width) = ARGBPolynomialRow_C;
   if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) {
     return -1;
   }
@@ -2395,12 +3013,11 @@
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -2425,16 +3042,121 @@
   return 0;
 }
 
+// Convert plane of 16 bit shorts to half floats.
+// Source values are multiplied by scale before storing as half float.
+LIBYUV_API
+int HalfFloatPlane(const uint16_t* src_y,
+                   int src_stride_y,
+                   uint16_t* dst_y,
+                   int dst_stride_y,
+                   float scale,
+                   int width,
+                   int height) {
+  int y;
+  void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale,
+                       int width) = HalfFloatRow_C;
+  if (!src_y || !dst_y || width <= 0 || height == 0) {
+    return -1;
+  }
+  src_stride_y >>= 1;
+  dst_stride_y >>= 1;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_stride_y = -src_stride_y;
+  }
+  // Coalesce rows.
+  if (src_stride_y == width && dst_stride_y == width) {
+    width *= height;
+    height = 1;
+    src_stride_y = dst_stride_y = 0;
+  }
+#if defined(HAS_HALFFLOATROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    HalfFloatRow = HalfFloatRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      HalfFloatRow = HalfFloatRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    HalfFloatRow = HalfFloatRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = HalfFloatRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_F16C)
+  if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) {
+    HalfFloatRow =
+        (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C;
+    if (IS_ALIGNED(width, 16)) {
+      HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    HalfFloatRow =
+        (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_HALFFLOATROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    HalfFloatRow = HalfFloatRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      HalfFloatRow = HalfFloatRow_MSA;
+    }
+  }
+#endif
+
+  for (y = 0; y < height; ++y) {
+    HalfFloatRow(src_y, dst_y, scale, width);
+    src_y += src_stride_y;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
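
A scalar sketch of what a HalfFloatRow-style conversion does; the 2^-112 multiplier trick is an assumption about the C implementation (not shown in this hunk): rescaling pushes the float exponent into half-float range so the top bits can be taken directly (truncating rather than rounding).

  #include <stdint.h>
  #include <string.h>

  // Hedged sketch, not the patch's code: uint16 -> IEEE half, scaled.
  static void HalfFloatRowSketch(const uint16_t* src, uint16_t* dst,
                                 float scale, int width) {
    const float mult = 1.9259299444e-34f * scale;  // 2^-112 * scale
    for (int i = 0; i < width; ++i) {
      float value = (float)src[i] * mult;
      uint32_t bits;
      memcpy(&bits, &value, sizeof(bits));  // bit-cast without aliasing UB
      dst[i] = (uint16_t)(bits >> 13);      // align float bits to half layout
    }
  }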
+
+// Convert a buffer of bytes to floats, scaling each value before it is stored.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) {
+  void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale,
+                         int width) = ByteToFloatRow_C;
+  if (!src_y || !dst_y || width <= 0) {
+    return -1;
+  }
+#if defined(HAS_BYTETOFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ByteToFloatRow = ByteToFloatRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ByteToFloatRow = ByteToFloatRow_NEON;
+    }
+  }
+#endif
+
+  ByteToFloatRow(src_y, dst_y, scale, width);
+  return 0;
+}
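
Unlike the planar functions, ByteToFloat takes a flat buffer with no stride or height. Illustrative use, normalizing bytes into [0.0, 1.0]:

  uint8_t src[256];
  float dst[256];
  memset(src, 255, sizeof(src));              // sample data
  ByteToFloat(src, dst, 1.0f / 255.0f, 256);  // every dst[i] becomes 1.0f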
+
 // Apply a lumacolortable to each ARGB pixel.
 LIBYUV_API
-int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_argb, int dst_stride_argb,
-                       const uint8* luma,
-                       int width, int height) {
+int ARGBLumaColorTable(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       const uint8_t* luma,
+                       int width,
+                       int height) {
   int y;
-  void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb,
-      int width, const uint8* luma, const uint32 lumacoeff) =
-      ARGBLumaColorTableRow_C;
+  void (*ARGBLumaColorTableRow)(
+      const uint8_t* src_argb, uint8_t* dst_argb, int width,
+      const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C;
   if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) {
     return -1;
   }
@@ -2441,12 +3163,11 @@
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb  = src_argb  + (height - 1) * src_stride_argb;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -2467,12 +3188,15 @@
 
 // Copy Alpha from one ARGB image to another.
 LIBYUV_API
-int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int width, int height) {
+int ARGBCopyAlpha(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
   int y;
-  void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) =
-      ARGBCopyAlphaRow_C;
+  void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb,
+                           int width) = ARGBCopyAlphaRow_C;
   if (!src_argb || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -2483,8 +3207,7 @@
     src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride_argb == width * 4 &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_argb = dst_stride_argb = 0;
@@ -2516,9 +3239,12 @@
 
 // Extract just the alpha channel from ARGB.
 LIBYUV_API
-int ARGBExtractAlpha(const uint8* src_argb, int src_stride,
-                     uint8* dst_a, int dst_stride,
-                     int width, int height) {
+int ARGBExtractAlpha(const uint8_t* src_argb,
+                     int src_stride_argb,
+                     uint8_t* dst_a,
+                     int dst_stride_a,
+                     int width,
+                     int height) {
   if (!src_argb || !dst_a || width <= 0 || height == 0) {
     return -1;
   }
@@ -2525,17 +3251,17 @@
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
-    src_argb += (height - 1) * src_stride;
-    src_stride = -src_stride;
+    src_argb += (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
   }
   // Coalesce rows.
-  if (src_stride == width * 4 && dst_stride == width) {
+  if (src_stride_argb == width * 4 && dst_stride_a == width) {
     width *= height;
     height = 1;
-    src_stride = dst_stride = 0;
+    src_stride_argb = dst_stride_a = 0;
   }
-  void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) =
-      ARGBExtractAlphaRow_C;
+  void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a,
+                              int width) = ARGBExtractAlphaRow_C;
 #if defined(HAS_ARGBEXTRACTALPHAROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_SSE2
@@ -2542,6 +3268,12 @@
                                                : ARGBExtractAlphaRow_Any_SSE2;
   }
 #endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2
+                                                : ARGBExtractAlphaRow_Any_AVX2;
+  }
+#endif
 #if defined(HAS_ARGBEXTRACTALPHAROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON
@@ -2548,11 +3280,17 @@
                                                 : ARGBExtractAlphaRow_Any_NEON;
   }
 #endif
+#if defined(HAS_ARGBEXTRACTALPHAROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA
+                                                : ARGBExtractAlphaRow_Any_MSA;
+  }
+#endif
 
   for (int y = 0; y < height; ++y) {
     ARGBExtractAlphaRow(src_argb, dst_a, width);
-    src_argb += src_stride;
-    dst_a += dst_stride;
+    src_argb += src_stride_argb;
+    dst_a += dst_stride_a;
   }
   return 0;
 }
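
The C fallback for alpha extraction reduces to copying every fourth byte; libyuv's ARGB is stored B,G,R,A in memory, so alpha sits at offset 3. A sketch (not this patch's code):

  static void ExtractAlphaRowSketch(const uint8_t* src_argb, uint8_t* dst_a,
                                    int width) {
    for (int i = 0; i < width; ++i) {
      dst_a[i] = src_argb[i * 4 + 3];  // A is the last byte of each pixel
    }
  }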
@@ -2559,12 +3297,15 @@
 
 // Copy a planar Y channel to the alpha channel of a destination ARGB image.
 LIBYUV_API
-int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y,
-                     uint8* dst_argb, int dst_stride_argb,
-                     int width, int height) {
+int ARGBCopyYToAlpha(const uint8_t* src_y,
+                     int src_stride_y,
+                     uint8_t* dst_argb,
+                     int dst_stride_argb,
+                     int width,
+                     int height) {
   int y;
-  void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) =
-      ARGBCopyYToAlphaRow_C;
+  void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb,
+                              int width) = ARGBCopyYToAlphaRow_C;
   if (!src_y || !dst_argb || width <= 0 || height == 0) {
     return -1;
   }
@@ -2575,8 +3316,7 @@
     src_stride_y = -src_stride_y;
   }
   // Coalesce rows.
-  if (src_stride_y == width &&
-      dst_stride_argb == width * 4) {
+  if (src_stride_y == width && dst_stride_argb == width * 4) {
     width *= height;
     height = 1;
     src_stride_y = dst_stride_argb = 0;
@@ -2610,20 +3350,22 @@
 // directly. A SplitUVRow_Odd function could copy the remaining chroma.
 
 LIBYUV_API
-int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int YUY2ToNV12(const uint8_t* src_yuy2,
+               int src_stride_yuy2,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+  void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                      int width) = SplitUVRow_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
-  if (!src_yuy2 ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -2656,6 +3398,14 @@
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -2680,6 +3430,14 @@
     }
   }
 #endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
+  }
+#endif
 
   {
     int awidth = halfwidth * 2;
@@ -2708,20 +3466,22 @@
 }
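
YUY2 packs two pixels into four bytes, Y0 U Y1 V, while NV12 wants a full Y plane plus one interleaved UV row per two source rows; the function above splits each row and half-blends the chroma of row pairs with InterpolateRow. A naive sketch of one row pair, with a plain average standing in for the half-blend (assumes even width, and only emits the first Y row of the pair):

  static void Yuy2PairToNv12Sketch(const uint8_t* row0, const uint8_t* row1,
                                   uint8_t* dst_y, uint8_t* dst_uv, int width) {
    for (int x = 0; x < width; x += 2) {
      const uint8_t* p0 = row0 + x * 2;  // Y0 U Y1 V for pixels x, x+1
      const uint8_t* p1 = row1 + x * 2;
      dst_y[x + 0] = p0[0];
      dst_y[x + 1] = p0[2];
      dst_uv[x + 0] = (uint8_t)((p0[1] + p1[1] + 1) >> 1);  // averaged U
      dst_uv[x + 1] = (uint8_t)((p0[3] + p1[3] + 1) >> 1);  // averaged V
    }
  }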
 
 LIBYUV_API
-int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_uv, int dst_stride_uv,
-               int width, int height) {
+int UYVYToNV12(const uint8_t* src_uyvy,
+               int src_stride_uyvy,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
   int y;
   int halfwidth = (width + 1) >> 1;
-  void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+  void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                      int width) = SplitUVRow_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) = InterpolateRow_C;
-  if (!src_uyvy ||
-      !dst_y || !dst_uv ||
-      width <= 0 || height == 0) {
+  if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) {
     return -1;
   }
   // Negative height means invert the image.
@@ -2754,6 +3514,14 @@
     }
   }
 #endif
+#if defined(HAS_SPLITUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    SplitUVRow = SplitUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      SplitUVRow = SplitUVRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -2775,6 +3543,14 @@
     InterpolateRow = InterpolateRow_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
       InterpolateRow = InterpolateRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
     }
   }
 #endif
--- a/third_party/libyuv/source/rotate.cc
+++ b/third_party/libyuv/source/rotate.cc
@@ -10,8 +10,8 @@
 
 #include "libyuv/rotate.h"
 
-#include "libyuv/cpu_id.h"
 #include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate_row.h"
 #include "libyuv/row.h"
@@ -22,12 +22,20 @@
 #endif
 
 LIBYUV_API
-void TransposePlane(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void TransposePlane(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   int i = height;
-  void (*TransposeWx8)(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
+                        int dst_stride, int width) = TransposeWx16_C;
+#else
+  void (*TransposeWx8)(const uint8_t* src, int src_stride, uint8_t* dst,
+                       int dst_stride, int width) = TransposeWx8_C;
+#endif
 #if defined(HAS_TRANSPOSEWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeWx8 = TransposeWx8_NEON;
@@ -49,24 +57,32 @@
     }
   }
 #endif
-#if defined(HAS_TRANSPOSEWX8_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-      TransposeWx8 = TransposeWx8_Fast_DSPR2;
-    } else {
-      TransposeWx8 = TransposeWx8_DSPR2;
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    TransposeWx16 = TransposeWx16_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      TransposeWx16 = TransposeWx16_MSA;
     }
   }
 #endif
 
+#if defined(HAS_TRANSPOSEWX16_MSA)
+  // Work across the source in 16x16 tiles
+  while (i >= 16) {
+    TransposeWx16(src, src_stride, dst, dst_stride, width);
+    src += 16 * src_stride;  // Go down 16 rows.
+    dst += 16;               // Move over 16 columns.
+    i -= 16;
+  }
+#else
   // Work across the source in 8x8 tiles
   while (i >= 8) {
     TransposeWx8(src, src_stride, dst, dst_stride, width);
-    src += 8 * src_stride;    // Go down 8 rows.
-    dst += 8;                 // Move over 8 columns.
+    src += 8 * src_stride;  // Go down 8 rows.
+    dst += 8;               // Move over 8 columns.
     i -= 8;
   }
+#endif
 
   if (i > 0) {
     TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
@@ -74,9 +90,12 @@
 }
 
 LIBYUV_API
-void RotatePlane90(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride,
-                   int width, int height) {
+void RotatePlane90(const uint8_t* src,
+                   int src_stride,
+                   uint8_t* dst,
+                   int dst_stride,
+                   int width,
+                   int height) {
   // Rotate by 90 is a transpose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
@@ -86,9 +105,12 @@
 }
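
A worked 3x2 example of why reading bottom-to-top before transposing gives the 90-degree rotation:

  // 3x2 source (width=3, height=2):    rotated 90 (width=2, height=3):
  //   a b c                              d a
  //   d e f                              e b
  //                                      f c
  // Starting at the last row (d e f) and walking upward, the transpose
  // writes each source row as a destination column, which is exactly the
  // right-hand layout.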
 
 LIBYUV_API
-void RotatePlane270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void RotatePlane270(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   // Rotate by 270 is a transpose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
@@ -98,17 +120,20 @@
 }
 
 LIBYUV_API
-void RotatePlane180(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void RotatePlane180(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   // Swap first and last row and mirror the content. Uses a temporary row.
   align_buffer_64(row, width);
-  const uint8* src_bot = src + src_stride * (height - 1);
-  uint8* dst_bot = dst + dst_stride * (height - 1);
+  const uint8_t* src_bot = src + src_stride * (height - 1);
+  uint8_t* dst_bot = dst + dst_stride * (height - 1);
   int half_height = (height + 1) >> 1;
   int y;
-  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
+  void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C;
 #if defined(HAS_MIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MirrorRow = MirrorRow_Any_NEON;
@@ -133,12 +158,12 @@
     }
   }
 #endif
-// TODO(fbarchard): Mirror on mips handle unaligned memory.
-#if defined(HAS_MIRRORROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
-    MirrorRow = MirrorRow_DSPR2;
+#if defined(HAS_MIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    MirrorRow = MirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 64)) {
+      MirrorRow = MirrorRow_MSA;
+    }
   }
 #endif
 #if defined(HAS_COPYROW_SSE2)
@@ -161,11 +186,6 @@
     CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
@@ -181,15 +201,24 @@
 }
 
 LIBYUV_API
-void TransposeUV(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void TransposeUV(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   int i = height;
-  void (*TransposeUVWx8)(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                          int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
+                          int width) = TransposeUVWx16_C;
+#else
+  void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a,
+                         int dst_stride_a, uint8_t* dst_b, int dst_stride_b,
                          int width) = TransposeUVWx8_C;
+#endif
 #if defined(HAS_TRANSPOSEUVWX8_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     TransposeUVWx8 = TransposeUVWx8_NEON;
@@ -203,72 +232,90 @@
     }
   }
 #endif
-#if defined(HAS_TRANSPOSEUVWX8_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-    TransposeUVWx8 = TransposeUVWx8_DSPR2;
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    TransposeUVWx16 = TransposeUVWx16_Any_MSA;
+    if (IS_ALIGNED(width, 8)) {
+      TransposeUVWx16 = TransposeUVWx16_MSA;
+    }
   }
 #endif
 
+#if defined(HAS_TRANSPOSEUVWX16_MSA)
+  // Work through the source in 16x16 tiles.
+  while (i >= 16) {
+    TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                    width);
+    src += 16 * src_stride;  // Go down 16 rows.
+    dst_a += 16;             // Move over 16 columns.
+    dst_b += 16;             // Move over 16 columns.
+    i -= 16;
+  }
+#else
  // Work through the source in 8x8 tiles.
   while (i >= 8) {
-    TransposeUVWx8(src, src_stride,
-                   dst_a, dst_stride_a,
-                   dst_b, dst_stride_b,
+    TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                    width);
-    src += 8 * src_stride;    // Go down 8 rows.
-    dst_a += 8;               // Move over 8 columns.
-    dst_b += 8;               // Move over 8 columns.
+    src += 8 * src_stride;  // Go down 8 rows.
+    dst_a += 8;             // Move over 8 columns.
+    dst_b += 8;             // Move over 8 columns.
     i -= 8;
   }
+#endif
 
   if (i > 0) {
-    TransposeUVWxH_C(src, src_stride,
-                     dst_a, dst_stride_a,
-                     dst_b, dst_stride_b,
+    TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
                      width, i);
   }
 }
 
 LIBYUV_API
-void RotateUV90(const uint8* src, int src_stride,
-                uint8* dst_a, int dst_stride_a,
-                uint8* dst_b, int dst_stride_b,
-                int width, int height) {
+void RotateUV90(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst_a,
+                int dst_stride_a,
+                uint8_t* dst_b,
+                int dst_stride_b,
+                int width,
+                int height) {
   src += src_stride * (height - 1);
   src_stride = -src_stride;
 
-  TransposeUV(src, src_stride,
-              dst_a, dst_stride_a,
-              dst_b, dst_stride_b,
-              width, height);
+  TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+              height);
 }
 
 LIBYUV_API
-void RotateUV270(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void RotateUV270(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   dst_a += dst_stride_a * (width - 1);
   dst_b += dst_stride_b * (width - 1);
   dst_stride_a = -dst_stride_a;
   dst_stride_b = -dst_stride_b;
 
-  TransposeUV(src, src_stride,
-              dst_a, dst_stride_a,
-              dst_b, dst_stride_b,
-              width, height);
+  TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width,
+              height);
 }
 
 // Rotate 180 is a horizontal and vertical flip.
 LIBYUV_API
-void RotateUV180(const uint8* src, int src_stride,
-                 uint8* dst_a, int dst_stride_a,
-                 uint8* dst_b, int dst_stride_b,
-                 int width, int height) {
+void RotateUV180(const uint8_t* src,
+                 int src_stride,
+                 uint8_t* dst_a,
+                 int dst_stride_a,
+                 uint8_t* dst_b,
+                 int dst_stride_b,
+                 int width,
+                 int height) {
   int i;
-  void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
-      MirrorUVRow_C;
+  void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
+                      int width) = MirrorUVRow_C;
 #if defined(HAS_MIRRORUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     MirrorUVRow = MirrorUVRow_NEON;
@@ -279,10 +326,9 @@
     MirrorUVRow = MirrorUVRow_SSSE3;
   }
 #endif
-#if defined(HAS_MIRRORUVROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
-    MirrorUVRow = MirrorUVRow_DSPR2;
+#if defined(HAS_MIRRORUVROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) {
+    MirrorUVRow = MirrorUVRow_MSA;
   }
 #endif
 
@@ -298,9 +344,12 @@
 }
 
 LIBYUV_API
-int RotatePlane(const uint8* src, int src_stride,
-                uint8* dst, int dst_stride,
-                int width, int height,
+int RotatePlane(const uint8_t* src,
+                int src_stride,
+                uint8_t* dst,
+                int dst_stride,
+                int width,
+                int height,
                 enum RotationMode mode) {
   if (!src || width <= 0 || height == 0 || !dst) {
     return -1;
@@ -316,24 +365,16 @@
   switch (mode) {
     case kRotate0:
       // copy frame
-      CopyPlane(src, src_stride,
-                dst, dst_stride,
-                width, height);
+      CopyPlane(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate90:
-      RotatePlane90(src, src_stride,
-                    dst, dst_stride,
-                    width, height);
+      RotatePlane90(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate270:
-      RotatePlane270(src, src_stride,
-                     dst, dst_stride,
-                     width, height);
+      RotatePlane270(src, src_stride, dst, dst_stride, width, height);
       return 0;
     case kRotate180:
-      RotatePlane180(src, src_stride,
-                     dst, dst_stride,
-                     width, height);
+      RotatePlane180(src, src_stride, dst, dst_stride, width, height);
       return 0;
     default:
       break;
@@ -342,18 +383,25 @@
 }
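
Illustrative call, rotating a 640x480 plane by 90 degrees; note that the destination dimensions and stride swap (the dst buffer is 480x640):

  uint8_t* src;  // assumed allocated, 640 * 480 bytes, stride 640
  uint8_t* dst;  // assumed allocated, 480 * 640 bytes, stride 480
  RotatePlane(src, 640, dst, 480, 640, 480, kRotate90);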
 
 LIBYUV_API
-int I420Rotate(const uint8* src_y, int src_stride_y,
-               const uint8* src_u, int src_stride_u,
-               const uint8* src_v, int src_stride_v,
-               uint8* dst_y, int dst_stride_y,
-               uint8* dst_u, int dst_stride_u,
-               uint8* dst_v, int dst_stride_v,
-               int width, int height,
+int I420Rotate(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_u,
+               int dst_stride_u,
+               uint8_t* dst_v,
+               int dst_stride_v,
+               int width,
+               int height,
                enum RotationMode mode) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
-      !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y ||
+      !dst_u || !dst_v) {
     return -1;
   }
 
@@ -372,45 +420,29 @@
   switch (mode) {
     case kRotate0:
       // copy frame
-      return I420Copy(src_y, src_stride_y,
-                      src_u, src_stride_u,
-                      src_v, src_stride_v,
-                      dst_y, dst_stride_y,
-                      dst_u, dst_stride_u,
-                      dst_v, dst_stride_v,
-                      width, height);
+      return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                      src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u,
+                      dst_v, dst_stride_v, width, height);
     case kRotate90:
-      RotatePlane90(src_y, src_stride_y,
-                    dst_y, dst_stride_y,
-                    width, height);
-      RotatePlane90(src_u, src_stride_u,
-                    dst_u, dst_stride_u,
-                    halfwidth, halfheight);
-      RotatePlane90(src_v, src_stride_v,
-                    dst_v, dst_stride_v,
-                    halfwidth, halfheight);
+      RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                    halfheight);
+      RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                    halfheight);
       return 0;
     case kRotate270:
-      RotatePlane270(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotatePlane270(src_u, src_stride_u,
-                     dst_u, dst_stride_u,
-                     halfwidth, halfheight);
-      RotatePlane270(src_v, src_stride_v,
-                     dst_v, dst_stride_v,
-                     halfwidth, halfheight);
+      RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                     halfheight);
+      RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                     halfheight);
       return 0;
     case kRotate180:
-      RotatePlane180(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotatePlane180(src_u, src_stride_u,
-                     dst_u, dst_stride_u,
-                     halfwidth, halfheight);
-      RotatePlane180(src_v, src_stride_v,
-                     dst_v, dst_stride_v,
-                     halfwidth, halfheight);
+      RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth,
+                     halfheight);
+      RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth,
+                     halfheight);
       return 0;
     default:
       break;
@@ -419,17 +451,23 @@
 }
 
 LIBYUV_API
-int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
-                     const uint8* src_uv, int src_stride_uv,
-                     uint8* dst_y, int dst_stride_y,
-                     uint8* dst_u, int dst_stride_u,
-                     uint8* dst_v, int dst_stride_v,
-                     int width, int height,
+int NV12ToI420Rotate(const uint8_t* src_y,
+                     int src_stride_y,
+                     const uint8_t* src_uv,
+                     int src_stride_uv,
+                     uint8_t* dst_y,
+                     int dst_stride_y,
+                     uint8_t* dst_u,
+                     int dst_stride_u,
+                     uint8_t* dst_v,
+                     int dst_stride_v,
+                     int width,
+                     int height,
                      enum RotationMode mode) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_uv || width <= 0 || height == 0 ||
-      !dst_y || !dst_u || !dst_v) {
+  if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u ||
+      !dst_v) {
     return -1;
   }
 
@@ -446,38 +484,23 @@
   switch (mode) {
     case kRotate0:
       // copy frame
-      return NV12ToI420(src_y, src_stride_y,
-                        src_uv, src_stride_uv,
-                        dst_y, dst_stride_y,
-                        dst_u, dst_stride_u,
-                        dst_v, dst_stride_v,
+      return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y,
+                        dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v,
                         width, height);
     case kRotate90:
-      RotatePlane90(src_y, src_stride_y,
-                    dst_y, dst_stride_y,
-                    width, height);
-      RotateUV90(src_uv, src_stride_uv,
-                 dst_u, dst_stride_u,
-                 dst_v, dst_stride_v,
-                 halfwidth, halfheight);
+      RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                 dst_stride_v, halfwidth, halfheight);
       return 0;
     case kRotate270:
-      RotatePlane270(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotateUV270(src_uv, src_stride_uv,
-                  dst_u, dst_stride_u,
-                  dst_v, dst_stride_v,
-                  halfwidth, halfheight);
+      RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                  dst_stride_v, halfwidth, halfheight);
       return 0;
     case kRotate180:
-      RotatePlane180(src_y, src_stride_y,
-                     dst_y, dst_stride_y,
-                     width, height);
-      RotateUV180(src_uv, src_stride_uv,
-                  dst_u, dst_stride_u,
-                  dst_v, dst_stride_v,
-                  halfwidth, halfheight);
+      RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+      RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
+                  dst_stride_v, halfwidth, halfheight);
       return 0;
     default:
       break;
--- a/third_party/libyuv/source/rotate_any.cc
+++ b/third_party/libyuv/source/rotate_any.cc
@@ -18,16 +18,16 @@
 extern "C" {
 #endif
 
-#define TANY(NAMEANY, TPOS_SIMD, MASK)                                         \
-    void NAMEANY(const uint8* src, int src_stride,                             \
-                 uint8* dst, int dst_stride, int width) {                      \
-      int r = width & MASK;                                                    \
-      int n = width - r;                                                       \
-      if (n > 0) {                                                             \
-        TPOS_SIMD(src, src_stride, dst, dst_stride, n);                        \
-      }                                                                        \
-      TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\
-    }
+#define TANY(NAMEANY, TPOS_SIMD, MASK)                                        \
+  void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst,              \
+               int dst_stride, int width) {                                   \
+    int r = width & MASK;                                                     \
+    int n = width - r;                                                        \
+    if (n > 0) {                                                              \
+      TPOS_SIMD(src, src_stride, dst, dst_stride, n);                         \
+    }                                                                         \
+    TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \
+  }
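
TANY wraps a fixed-granularity SIMD transpose with a scalar tail: MASK + 1 is the SIMD column granularity. A worked expansion for one of the instantiations below:

  // TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) with width = 29:
  //   r = 29 & 7 = 5    // leftover columns for the C tail
  //   n = 29 - 5 = 24   // multiple-of-8 columns for NEON
  // TransposeWx8_NEON handles columns 0..23; TransposeWx8_C finishes
  // columns 24..28 at dst + 24 * dst_stride, since source columns land
  // as destination rows.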
 
 #ifdef HAS_TRANSPOSEWX8_NEON
 TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7)
@@ -38,25 +38,23 @@
 #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3
 TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15)
 #endif
-#ifdef HAS_TRANSPOSEWX8_DSPR2
-TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7)
+#ifdef HAS_TRANSPOSEWX16_MSA
+TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15)
 #endif
 #undef TANY
 
 #define TUVANY(NAMEANY, TPOS_SIMD, MASK)                                       \
-    void NAMEANY(const uint8* src, int src_stride,                             \
-                uint8* dst_a, int dst_stride_a,                                \
-                uint8* dst_b, int dst_stride_b, int width) {                   \
-      int r = width & MASK;                                                    \
-      int n = width - r;                                                       \
-      if (n > 0) {                                                             \
-        TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,   \
-                  n);                                                          \
-      }                                                                        \
-      TransposeUVWx8_C(src + n * 2, src_stride,                                \
-                       dst_a + n * dst_stride_a, dst_stride_a,                 \
-                       dst_b + n * dst_stride_b, dst_stride_b, r);             \
-    }
+  void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a,             \
+               int dst_stride_a, uint8_t* dst_b, int dst_stride_b,             \
+               int width) {                                                    \
+    int r = width & MASK;                                                      \
+    int n = width - r;                                                         \
+    if (n > 0) {                                                               \
+      TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \
+    }                                                                          \
+    TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a,        \
+                     dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \
+  }
 
 #ifdef HAS_TRANSPOSEUVWX8_NEON
 TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7)
@@ -64,8 +62,8 @@
 #ifdef HAS_TRANSPOSEUVWX8_SSE2
 TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7)
 #endif
-#ifdef HAS_TRANSPOSEUVWX8_DSPR2
-TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7)
+#ifdef HAS_TRANSPOSEUVWX16_MSA
+TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7)
 #endif
 #undef TUVANY
 
@@ -73,8 +71,3 @@
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
-
-
-
-
--- a/third_party/libyuv/source/rotate_argb.cc
+++ b/third_party/libyuv/source/rotate_argb.cc
@@ -10,10 +10,11 @@
 
 #include "libyuv/rotate.h"
 
-#include "libyuv/cpu_id.h"
 #include "libyuv/convert.h"
+#include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/row.h"
+#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -20,80 +21,95 @@
 extern "C" {
 #endif
 
-// ARGBScale has a function to copy pixels to a row, striding each source
-// pixel by a constant.
-#if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(_M_IX86) || \
-    (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
-#define HAS_SCALEARGBROWDOWNEVEN_SSE2
-void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
-                               int src_stepx, uint8* dst_ptr, int dst_width);
-#endif
-#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
-    (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__))
-#define HAS_SCALEARGBROWDOWNEVEN_NEON
-void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
-                               int src_stepx, uint8* dst_ptr, int dst_width);
-#endif
-
-void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
-                            int src_stepx, uint8* dst_ptr, int dst_width);
-
-static void ARGBTranspose(const uint8* src, int src_stride,
-                          uint8* dst, int dst_stride, int width, int height) {
+static void ARGBTranspose(const uint8_t* src_argb,
+                          int src_stride_argb,
+                          uint8_t* dst_argb,
+                          int dst_stride_argb,
+                          int width,
+                          int height) {
   int i;
-  int src_pixel_step = src_stride >> 2;
-  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
-      int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+  int src_pixel_step = src_stride_argb >> 2;
+  void (*ScaleARGBRowDownEven)(
+      const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
+      uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) {  // Width of dest.
-    ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+    }
   }
 #endif
 #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) {  // Width of dest.
-    ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+    }
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA;
+    if (IS_ALIGNED(height, 4)) {  // Width of dest.
+      ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA;
+    }
+  }
+#endif
 
   for (i = 0; i < width; ++i) {  // column of source to row of dest.
-    ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
-    dst += dst_stride;
-    src += 4;
+    ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height);
+    dst_argb += dst_stride_argb;
+    src_argb += 4;
   }
 }
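
ARGBTranspose borrows the scaler's even-pixel row copier to gather a column: src_pixel_step is the stride converted from bytes to pixels (stride >> 2 for 4-byte ARGB), so stepping by it walks down one source column, which is written out contiguously as one destination row:

  // One loop iteration, conceptually:
  //   ScaleARGBRowDownEven(src_argb, 0, src_stride_argb >> 2, dst_argb, height)
  // reads src pixel 0, pixel stride, pixel 2*stride, ... (height pixels)
  // and stores them back to back: source column i becomes dest row i.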
 
-void ARGBRotate90(const uint8* src, int src_stride,
-                  uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate90(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int width,
+                  int height) {
  // Rotate by 90 is an ARGBTranspose with the source read
   // from bottom to top. So set the source pointer to the end
   // of the buffer and flip the sign of the source stride.
-  src += src_stride * (height - 1);
-  src_stride = -src_stride;
-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+  src_argb += src_stride_argb * (height - 1);
+  src_stride_argb = -src_stride_argb;
+  ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                height);
 }
 
-void ARGBRotate270(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate270(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
  // Rotate by 270 is an ARGBTranspose with the destination written
   // from bottom to top. So set the destination pointer to the end
   // of the buffer and flip the sign of the destination stride.
-  dst += dst_stride * (width - 1);
-  dst_stride = -dst_stride;
-  ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+  dst_argb += dst_stride_argb * (width - 1);
+  dst_stride_argb = -dst_stride_argb;
+  ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                height);
 }
 
-void ARGBRotate180(const uint8* src, int src_stride,
-                   uint8* dst, int dst_stride, int width, int height) {
+void ARGBRotate180(const uint8_t* src_argb,
+                   int src_stride_argb,
+                   uint8_t* dst_argb,
+                   int dst_stride_argb,
+                   int width,
+                   int height) {
   // Swap first and last row and mirror the content. Uses a temporary row.
   align_buffer_64(row, width * 4);
-  const uint8* src_bot = src + src_stride * (height - 1);
-  uint8* dst_bot = dst + dst_stride * (height - 1);
+  const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
+  uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1);
   int half_height = (height + 1) >> 1;
   int y;
-  void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+  void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
       ARGBMirrorRow_C;
-  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+  void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) =
+      CopyRow_C;
 #if defined(HAS_ARGBMIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
@@ -118,6 +134,14 @@
     }
   }
 #endif
+#if defined(HAS_ARGBMIRRORROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ARGBMirrorRow = ARGBMirrorRow_Any_MSA;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBMirrorRow = ARGBMirrorRow_MSA;
+    }
+  }
+#endif
 #if defined(HAS_COPYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2;
@@ -138,28 +162,27 @@
     CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON;
   }
 #endif
-#if defined(HAS_COPYROW_MIPS)
-  if (TestCpuFlag(kCpuHasMIPS)) {
-    CopyRow = CopyRow_MIPS;
-  }
-#endif
 
   // Odd height will harmlessly mirror the middle row twice.
   for (y = 0; y < half_height; ++y) {
-    ARGBMirrorRow(src, row, width);  // Mirror first row into a buffer
-    ARGBMirrorRow(src_bot, dst, width);  // Mirror last row into first row
+    ARGBMirrorRow(src_argb, row, width);      // Mirror first row into a buffer
+    ARGBMirrorRow(src_bot, dst_argb, width);  // Mirror last row into first row
     CopyRow(row, dst_bot, width * 4);  // Copy first mirrored row into last
-    src += src_stride;
-    dst += dst_stride;
-    src_bot -= src_stride;
-    dst_bot -= dst_stride;
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+    src_bot -= src_stride_argb;
+    dst_bot -= dst_stride_argb;
   }
   free_aligned_buffer_64(row);
 }
 
 LIBYUV_API
-int ARGBRotate(const uint8* src_argb, int src_stride_argb,
-               uint8* dst_argb, int dst_stride_argb, int width, int height,
+int ARGBRotate(const uint8_t* src_argb,
+               int src_stride_argb,
+               uint8_t* dst_argb,
+               int dst_stride_argb,
+               int width,
+               int height,
                enum RotationMode mode) {
   if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
     return -1;
@@ -175,23 +198,19 @@
   switch (mode) {
     case kRotate0:
       // copy frame
-      return ARGBCopy(src_argb, src_stride_argb,
-                      dst_argb, dst_stride_argb,
+      return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                       width, height);
     case kRotate90:
-      ARGBRotate90(src_argb, src_stride_argb,
-                   dst_argb, dst_stride_argb,
-                   width, height);
+      ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                   height);
       return 0;
     case kRotate270:
-      ARGBRotate270(src_argb, src_stride_argb,
-                    dst_argb, dst_stride_argb,
-                    width, height);
+      ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                    height);
       return 0;
     case kRotate180:
-      ARGBRotate180(src_argb, src_stride_argb,
-                    dst_argb, dst_stride_argb,
-                    width, height);
+      ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
+                    height);
       return 0;
     default:
       break;
--- a/third_party/libyuv/source/rotate_common.cc
+++ b/third_party/libyuv/source/rotate_common.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -16,8 +16,11 @@
 extern "C" {
 #endif
 
-void TransposeWx8_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride, int width) {
+void TransposeWx8_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width) {
   int i;
   for (i = 0; i < width; ++i) {
     dst[0] = src[0 * src_stride];
@@ -33,9 +36,13 @@
   }
 }
 
-void TransposeUVWx8_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b, int width) {
+void TransposeUVWx8_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width) {
   int i;
   for (i = 0; i < width; ++i) {
     dst_a[0] = src[0 * src_stride + 0];
@@ -60,9 +67,12 @@
   }
 }
 
-void TransposeWxH_C(const uint8* src, int src_stride,
-                    uint8* dst, int dst_stride,
-                    int width, int height) {
+void TransposeWxH_C(const uint8_t* src,
+                    int src_stride,
+                    uint8_t* dst,
+                    int dst_stride,
+                    int width,
+                    int height) {
   int i;
   for (i = 0; i < width; ++i) {
     int j;
@@ -72,10 +82,14 @@
   }
 }
 
-void TransposeUVWxH_C(const uint8* src, int src_stride,
-                      uint8* dst_a, int dst_stride_a,
-                      uint8* dst_b, int dst_stride_b,
-                      int width, int height) {
+void TransposeUVWxH_C(const uint8_t* src,
+                      int src_stride,
+                      uint8_t* dst_a,
+                      int dst_stride_a,
+                      uint8_t* dst_b,
+                      int dst_stride_b,
+                      int width,
+                      int height) {
   int i;
   for (i = 0; i < width * 2; i += 2) {
     int j;
--- a/third_party/libyuv/source/rotate_gcc.cc
+++ b/third_party/libyuv/source/rotate_gcc.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -22,342 +22,348 @@
 
 // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
 #if defined(HAS_TRANSPOSEWX8_SSSE3)
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movq       (%0),%%xmm0                      \n"
-    "movq       (%0,%3),%%xmm1                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "movq       (%0),%%xmm2                      \n"
-    "movdqa     %%xmm0,%%xmm1                    \n"
-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
-    "movq       (%0,%3),%%xmm3                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm3                    \n"
-    "movq       (%0),%%xmm4                      \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "movq       (%0,%3),%%xmm5                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "movdqa     %%xmm4,%%xmm5                    \n"
-    "movq       (%0),%%xmm6                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       (%0,%3),%%xmm7                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "neg        %3                               \n"
-    "movdqa     %%xmm6,%%xmm7                    \n"
-    "lea        0x8(%0,%3,8),%0                  \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "neg        %3                               \n"
-     // Second round of bit swap.
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm4,%%xmm6                    \n"
-    "movdqa     %%xmm5,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movq       %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm0,%%xmm4                    \n"
-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
-    "movq       %%xmm4,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm6                    \n"
-    "movq       %%xmm2,(%1)                      \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movq       %%xmm6,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm1,%%xmm5                    \n"
-    "movq       %%xmm1,(%1)                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       %%xmm5,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movq       %%xmm3,(%1)                      \n"
-    "movdqa     %%xmm3,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "sub        $0x8,%2                          \n"
-    "movq       %%xmm7,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst),    // %1
-      "+r"(width)   // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "r"((intptr_t)(dst_stride))   // %4
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void TransposeWx8_SSSE3(const uint8_t* src,
+                        int src_stride,
+                        uint8_t* dst,
+                        int dst_stride,
+                        int width) {
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                          \n"
+      "movq       (%0),%%xmm0                      \n"
+      "movq       (%0,%3),%%xmm1                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm1,%%xmm0                    \n"
+      "movq       (%0),%%xmm2                      \n"
+      "movdqa     %%xmm0,%%xmm1                    \n"
+      "palignr    $0x8,%%xmm1,%%xmm1               \n"
+      "movq       (%0,%3),%%xmm3                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm3,%%xmm2                    \n"
+      "movdqa     %%xmm2,%%xmm3                    \n"
+      "movq       (%0),%%xmm4                      \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "movq       (%0,%3),%%xmm5                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm5,%%xmm4                    \n"
+      "movdqa     %%xmm4,%%xmm5                    \n"
+      "movq       (%0),%%xmm6                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "movq       (%0,%3),%%xmm7                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "punpcklbw  %%xmm7,%%xmm6                    \n"
+      "neg        %3                               \n"
+      "movdqa     %%xmm6,%%xmm7                    \n"
+      "lea        0x8(%0,%3,8),%0                  \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "neg        %3                               \n"
+      // Second round of bit swap.
+      "punpcklwd  %%xmm2,%%xmm0                    \n"
+      "punpcklwd  %%xmm3,%%xmm1                    \n"
+      "movdqa     %%xmm0,%%xmm2                    \n"
+      "movdqa     %%xmm1,%%xmm3                    \n"
+      "palignr    $0x8,%%xmm2,%%xmm2               \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "punpcklwd  %%xmm6,%%xmm4                    \n"
+      "punpcklwd  %%xmm7,%%xmm5                    \n"
+      "movdqa     %%xmm4,%%xmm6                    \n"
+      "movdqa     %%xmm5,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "punpckldq  %%xmm4,%%xmm0                    \n"
+      "movq       %%xmm0,(%1)                      \n"
+      "movdqa     %%xmm0,%%xmm4                    \n"
+      "palignr    $0x8,%%xmm4,%%xmm4               \n"
+      "movq       %%xmm4,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm6,%%xmm2                    \n"
+      "movdqa     %%xmm2,%%xmm6                    \n"
+      "movq       %%xmm2,(%1)                      \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "punpckldq  %%xmm5,%%xmm1                    \n"
+      "movq       %%xmm6,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "movdqa     %%xmm1,%%xmm5                    \n"
+      "movq       %%xmm1,(%1)                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "movq       %%xmm5,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm7,%%xmm3                    \n"
+      "movq       %%xmm3,(%1)                      \n"
+      "movdqa     %%xmm3,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "sub        $0x8,%2                          \n"
+      "movq       %%xmm7,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "jg         1b                               \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(width)                   // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "r"((intptr_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // defined(HAS_TRANSPOSEWX8_SSSE3)
 
 // Transpose 16x8. 64 bit
 #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
-void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     (%0),%%xmm0                      \n"
-    "movdqu     (%0,%3),%%xmm1                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "punpckhbw  %%xmm1,%%xmm8                    \n"
-    "movdqu     (%0),%%xmm2                      \n"
-    "movdqa     %%xmm0,%%xmm1                    \n"
-    "movdqa     %%xmm8,%%xmm9                    \n"
-    "palignr    $0x8,%%xmm1,%%xmm1               \n"
-    "palignr    $0x8,%%xmm9,%%xmm9               \n"
-    "movdqu     (%0,%3),%%xmm3                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm2,%%xmm10                   \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "punpckhbw  %%xmm3,%%xmm10                   \n"
-    "movdqa     %%xmm2,%%xmm3                    \n"
-    "movdqa     %%xmm10,%%xmm11                  \n"
-    "movdqu     (%0),%%xmm4                      \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "palignr    $0x8,%%xmm11,%%xmm11             \n"
-    "movdqu     (%0,%3),%%xmm5                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm4,%%xmm12                   \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "punpckhbw  %%xmm5,%%xmm12                   \n"
-    "movdqa     %%xmm4,%%xmm5                    \n"
-    "movdqa     %%xmm12,%%xmm13                  \n"
-    "movdqu     (%0),%%xmm6                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "palignr    $0x8,%%xmm13,%%xmm13             \n"
-    "movdqu     (%0,%3),%%xmm7                   \n"
-    "lea        (%0,%3,2),%0                     \n"
-    "movdqa     %%xmm6,%%xmm14                   \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "punpckhbw  %%xmm7,%%xmm14                   \n"
-    "neg        %3                               \n"
-    "movdqa     %%xmm6,%%xmm7                    \n"
-    "movdqa     %%xmm14,%%xmm15                  \n"
-    "lea        0x10(%0,%3,8),%0                 \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    "neg        %3                               \n"
-     // Second round of bit swap.
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "palignr    $0x8,%%xmm2,%%xmm2               \n"
-    "palignr    $0x8,%%xmm3,%%xmm3               \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm4,%%xmm6                    \n"
-    "movdqa     %%xmm5,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "punpcklwd  %%xmm10,%%xmm8                   \n"
-    "punpcklwd  %%xmm11,%%xmm9                   \n"
-    "movdqa     %%xmm8,%%xmm10                   \n"
-    "movdqa     %%xmm9,%%xmm11                   \n"
-    "palignr    $0x8,%%xmm10,%%xmm10             \n"
-    "palignr    $0x8,%%xmm11,%%xmm11             \n"
-    "punpcklwd  %%xmm14,%%xmm12                  \n"
-    "punpcklwd  %%xmm15,%%xmm13                  \n"
-    "movdqa     %%xmm12,%%xmm14                  \n"
-    "movdqa     %%xmm13,%%xmm15                  \n"
-    "palignr    $0x8,%%xmm14,%%xmm14             \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movq       %%xmm0,(%1)                      \n"
-    "movdqa     %%xmm0,%%xmm4                    \n"
-    "palignr    $0x8,%%xmm4,%%xmm4               \n"
-    "movq       %%xmm4,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movdqa     %%xmm2,%%xmm6                    \n"
-    "movq       %%xmm2,(%1)                      \n"
-    "palignr    $0x8,%%xmm6,%%xmm6               \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movq       %%xmm6,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm1,%%xmm5                    \n"
-    "movq       %%xmm1,(%1)                      \n"
-    "palignr    $0x8,%%xmm5,%%xmm5               \n"
-    "movq       %%xmm5,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movq       %%xmm3,(%1)                      \n"
-    "movdqa     %%xmm3,%%xmm7                    \n"
-    "palignr    $0x8,%%xmm7,%%xmm7               \n"
-    "movq       %%xmm7,(%1,%4)                   \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm12,%%xmm8                   \n"
-    "movq       %%xmm8,(%1)                      \n"
-    "movdqa     %%xmm8,%%xmm12                   \n"
-    "palignr    $0x8,%%xmm12,%%xmm12             \n"
-    "movq       %%xmm12,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm14,%%xmm10                  \n"
-    "movdqa     %%xmm10,%%xmm14                  \n"
-    "movq       %%xmm10,(%1)                     \n"
-    "palignr    $0x8,%%xmm14,%%xmm14             \n"
-    "punpckldq  %%xmm13,%%xmm9                   \n"
-    "movq       %%xmm14,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "movdqa     %%xmm9,%%xmm13                   \n"
-    "movq       %%xmm9,(%1)                      \n"
-    "palignr    $0x8,%%xmm13,%%xmm13             \n"
-    "movq       %%xmm13,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "punpckldq  %%xmm15,%%xmm11                  \n"
-    "movq       %%xmm11,(%1)                     \n"
-    "movdqa     %%xmm11,%%xmm15                  \n"
-    "palignr    $0x8,%%xmm15,%%xmm15             \n"
-    "sub        $0x10,%2                         \n"
-    "movq       %%xmm15,(%1,%4)                  \n"
-    "lea        (%1,%4,2),%1                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst),    // %1
-      "+r"(width)   // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "r"((intptr_t)(dst_stride))   // %4
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
-  );
+void TransposeWx8_Fast_SSSE3(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst,
+                             int dst_stride,
+                             int width) {
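+  // Same three-round interleave as TransposeWx8_SSSE3 above, but 16 columns
+  // per pass using xmm8-xmm15, which is why this path is 64-bit only.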
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                          \n"
+      "movdqu     (%0),%%xmm0                      \n"
+      "movdqu     (%0,%3),%%xmm1                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "punpcklbw  %%xmm1,%%xmm0                    \n"
+      "punpckhbw  %%xmm1,%%xmm8                    \n"
+      "movdqu     (%0),%%xmm2                      \n"
+      "movdqa     %%xmm0,%%xmm1                    \n"
+      "movdqa     %%xmm8,%%xmm9                    \n"
+      "palignr    $0x8,%%xmm1,%%xmm1               \n"
+      "palignr    $0x8,%%xmm9,%%xmm9               \n"
+      "movdqu     (%0,%3),%%xmm3                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm2,%%xmm10                   \n"
+      "punpcklbw  %%xmm3,%%xmm2                    \n"
+      "punpckhbw  %%xmm3,%%xmm10                   \n"
+      "movdqa     %%xmm2,%%xmm3                    \n"
+      "movdqa     %%xmm10,%%xmm11                  \n"
+      "movdqu     (%0),%%xmm4                      \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "palignr    $0x8,%%xmm11,%%xmm11             \n"
+      "movdqu     (%0,%3),%%xmm5                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm4,%%xmm12                   \n"
+      "punpcklbw  %%xmm5,%%xmm4                    \n"
+      "punpckhbw  %%xmm5,%%xmm12                   \n"
+      "movdqa     %%xmm4,%%xmm5                    \n"
+      "movdqa     %%xmm12,%%xmm13                  \n"
+      "movdqu     (%0),%%xmm6                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "palignr    $0x8,%%xmm13,%%xmm13             \n"
+      "movdqu     (%0,%3),%%xmm7                   \n"
+      "lea        (%0,%3,2),%0                     \n"
+      "movdqa     %%xmm6,%%xmm14                   \n"
+      "punpcklbw  %%xmm7,%%xmm6                    \n"
+      "punpckhbw  %%xmm7,%%xmm14                   \n"
+      "neg        %3                               \n"
+      "movdqa     %%xmm6,%%xmm7                    \n"
+      "movdqa     %%xmm14,%%xmm15                  \n"
+      "lea        0x10(%0,%3,8),%0                 \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "palignr    $0x8,%%xmm15,%%xmm15             \n"
+      "neg        %3                               \n"
+      // Second round of bit swap.
+      "punpcklwd  %%xmm2,%%xmm0                    \n"
+      "punpcklwd  %%xmm3,%%xmm1                    \n"
+      "movdqa     %%xmm0,%%xmm2                    \n"
+      "movdqa     %%xmm1,%%xmm3                    \n"
+      "palignr    $0x8,%%xmm2,%%xmm2               \n"
+      "palignr    $0x8,%%xmm3,%%xmm3               \n"
+      "punpcklwd  %%xmm6,%%xmm4                    \n"
+      "punpcklwd  %%xmm7,%%xmm5                    \n"
+      "movdqa     %%xmm4,%%xmm6                    \n"
+      "movdqa     %%xmm5,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "punpcklwd  %%xmm10,%%xmm8                   \n"
+      "punpcklwd  %%xmm11,%%xmm9                   \n"
+      "movdqa     %%xmm8,%%xmm10                   \n"
+      "movdqa     %%xmm9,%%xmm11                   \n"
+      "palignr    $0x8,%%xmm10,%%xmm10             \n"
+      "palignr    $0x8,%%xmm11,%%xmm11             \n"
+      "punpcklwd  %%xmm14,%%xmm12                  \n"
+      "punpcklwd  %%xmm15,%%xmm13                  \n"
+      "movdqa     %%xmm12,%%xmm14                  \n"
+      "movdqa     %%xmm13,%%xmm15                  \n"
+      "palignr    $0x8,%%xmm14,%%xmm14             \n"
+      "palignr    $0x8,%%xmm15,%%xmm15             \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "punpckldq  %%xmm4,%%xmm0                    \n"
+      "movq       %%xmm0,(%1)                      \n"
+      "movdqa     %%xmm0,%%xmm4                    \n"
+      "palignr    $0x8,%%xmm4,%%xmm4               \n"
+      "movq       %%xmm4,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm6,%%xmm2                    \n"
+      "movdqa     %%xmm2,%%xmm6                    \n"
+      "movq       %%xmm2,(%1)                      \n"
+      "palignr    $0x8,%%xmm6,%%xmm6               \n"
+      "punpckldq  %%xmm5,%%xmm1                    \n"
+      "movq       %%xmm6,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "movdqa     %%xmm1,%%xmm5                    \n"
+      "movq       %%xmm1,(%1)                      \n"
+      "palignr    $0x8,%%xmm5,%%xmm5               \n"
+      "movq       %%xmm5,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm7,%%xmm3                    \n"
+      "movq       %%xmm3,(%1)                      \n"
+      "movdqa     %%xmm3,%%xmm7                    \n"
+      "palignr    $0x8,%%xmm7,%%xmm7               \n"
+      "movq       %%xmm7,(%1,%4)                   \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm12,%%xmm8                   \n"
+      "movq       %%xmm8,(%1)                      \n"
+      "movdqa     %%xmm8,%%xmm12                   \n"
+      "palignr    $0x8,%%xmm12,%%xmm12             \n"
+      "movq       %%xmm12,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm14,%%xmm10                  \n"
+      "movdqa     %%xmm10,%%xmm14                  \n"
+      "movq       %%xmm10,(%1)                     \n"
+      "palignr    $0x8,%%xmm14,%%xmm14             \n"
+      "punpckldq  %%xmm13,%%xmm9                   \n"
+      "movq       %%xmm14,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "movdqa     %%xmm9,%%xmm13                   \n"
+      "movq       %%xmm9,(%1)                      \n"
+      "palignr    $0x8,%%xmm13,%%xmm13             \n"
+      "movq       %%xmm13,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "punpckldq  %%xmm15,%%xmm11                  \n"
+      "movq       %%xmm11,(%1)                     \n"
+      "movdqa     %%xmm11,%%xmm15                  \n"
+      "palignr    $0x8,%%xmm15,%%xmm15             \n"
+      "sub        $0x10,%2                         \n"
+      "movq       %%xmm15,(%1,%4)                  \n"
+      "lea        (%1,%4,2),%1                     \n"
+      "jg         1b                               \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(width)                   // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "r"((intptr_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+        "xmm15");
 }
 #endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
 
 // Transpose UV 8x8.  64 bit.
 #if defined(HAS_TRANSPOSEUVWX8_SSE2)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b, int width) {
-  asm volatile (
-    // Read in the data from the source pointer.
-    // First round of bit swap.
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     (%0),%%xmm0                      \n"
-    "movdqu     (%0,%4),%%xmm1                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpcklbw  %%xmm1,%%xmm0                    \n"
-    "punpckhbw  %%xmm1,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm1                    \n"
-    "movdqu     (%0),%%xmm2                      \n"
-    "movdqu     (%0,%4),%%xmm3                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm2,%%xmm8                    \n"
-    "punpcklbw  %%xmm3,%%xmm2                    \n"
-    "punpckhbw  %%xmm3,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm3                    \n"
-    "movdqu     (%0),%%xmm4                      \n"
-    "movdqu     (%0,%4),%%xmm5                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm4,%%xmm8                    \n"
-    "punpcklbw  %%xmm5,%%xmm4                    \n"
-    "punpckhbw  %%xmm5,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm5                    \n"
-    "movdqu     (%0),%%xmm6                      \n"
-    "movdqu     (%0,%4),%%xmm7                   \n"
-    "lea        (%0,%4,2),%0                     \n"
-    "movdqa     %%xmm6,%%xmm8                    \n"
-    "punpcklbw  %%xmm7,%%xmm6                    \n"
-    "neg        %4                               \n"
-    "lea        0x10(%0,%4,8),%0                 \n"
-    "punpckhbw  %%xmm7,%%xmm8                    \n"
-    "movdqa     %%xmm8,%%xmm7                    \n"
-    "neg        %4                               \n"
-     // Second round of bit swap.
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "movdqa     %%xmm1,%%xmm9                    \n"
-    "punpckhwd  %%xmm2,%%xmm8                    \n"
-    "punpckhwd  %%xmm3,%%xmm9                    \n"
-    "punpcklwd  %%xmm2,%%xmm0                    \n"
-    "punpcklwd  %%xmm3,%%xmm1                    \n"
-    "movdqa     %%xmm8,%%xmm2                    \n"
-    "movdqa     %%xmm9,%%xmm3                    \n"
-    "movdqa     %%xmm4,%%xmm8                    \n"
-    "movdqa     %%xmm5,%%xmm9                    \n"
-    "punpckhwd  %%xmm6,%%xmm8                    \n"
-    "punpckhwd  %%xmm7,%%xmm9                    \n"
-    "punpcklwd  %%xmm6,%%xmm4                    \n"
-    "punpcklwd  %%xmm7,%%xmm5                    \n"
-    "movdqa     %%xmm8,%%xmm6                    \n"
-    "movdqa     %%xmm9,%%xmm7                    \n"
-    // Third round of bit swap.
-    // Write to the destination pointer.
-    "movdqa     %%xmm0,%%xmm8                    \n"
-    "punpckldq  %%xmm4,%%xmm0                    \n"
-    "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
-    "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
-    "punpckhdq  %%xmm4,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm2,%%xmm8                    \n"
-    "punpckldq  %%xmm6,%%xmm2                    \n"
-    "movlpd     %%xmm2,(%1)                      \n"
-    "movhpd     %%xmm2,(%2)                      \n"
-    "punpckhdq  %%xmm6,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm1,%%xmm8                    \n"
-    "punpckldq  %%xmm5,%%xmm1                    \n"
-    "movlpd     %%xmm1,(%1)                      \n"
-    "movhpd     %%xmm1,(%2)                      \n"
-    "punpckhdq  %%xmm5,%%xmm8                    \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "movdqa     %%xmm3,%%xmm8                    \n"
-    "punpckldq  %%xmm7,%%xmm3                    \n"
-    "movlpd     %%xmm3,(%1)                      \n"
-    "movhpd     %%xmm3,(%2)                      \n"
-    "punpckhdq  %%xmm7,%%xmm8                    \n"
-    "sub        $0x8,%3                          \n"
-    "movlpd     %%xmm8,(%1,%5)                   \n"
-    "lea        (%1,%5,2),%1                     \n"
-    "movhpd     %%xmm8,(%2,%6)                   \n"
-    "lea        (%2,%6,2),%2                     \n"
-    "jg         1b                               \n"
-    : "+r"(src),    // %0
-      "+r"(dst_a),  // %1
-      "+r"(dst_b),  // %2
-      "+r"(width)   // %3
-    : "r"((intptr_t)(src_stride)),    // %4
-      "r"((intptr_t)(dst_stride_a)),  // %5
-      "r"((intptr_t)(dst_stride_b))   // %6
-    : "memory", "cc",
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
-      "xmm8", "xmm9"
-  );
+void TransposeUVWx8_SSE2(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width) {
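+  // Each pass transposes an 8x8 block of interleaved UV pairs, splitting the
+  // transposed rows so U bytes land in dst_a and V bytes in dst_b.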
+  asm volatile(
+      // Read in the data from the source pointer.
+      // First round of bit swap.
+      LABELALIGN
+      "1:                                          \n"
+      "movdqu     (%0),%%xmm0                      \n"
+      "movdqu     (%0,%4),%%xmm1                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "punpcklbw  %%xmm1,%%xmm0                    \n"
+      "punpckhbw  %%xmm1,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm1                    \n"
+      "movdqu     (%0),%%xmm2                      \n"
+      "movdqu     (%0,%4),%%xmm3                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm2,%%xmm8                    \n"
+      "punpcklbw  %%xmm3,%%xmm2                    \n"
+      "punpckhbw  %%xmm3,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm3                    \n"
+      "movdqu     (%0),%%xmm4                      \n"
+      "movdqu     (%0,%4),%%xmm5                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm4,%%xmm8                    \n"
+      "punpcklbw  %%xmm5,%%xmm4                    \n"
+      "punpckhbw  %%xmm5,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm5                    \n"
+      "movdqu     (%0),%%xmm6                      \n"
+      "movdqu     (%0,%4),%%xmm7                   \n"
+      "lea        (%0,%4,2),%0                     \n"
+      "movdqa     %%xmm6,%%xmm8                    \n"
+      "punpcklbw  %%xmm7,%%xmm6                    \n"
+      "neg        %4                               \n"
+      "lea        0x10(%0,%4,8),%0                 \n"
+      "punpckhbw  %%xmm7,%%xmm8                    \n"
+      "movdqa     %%xmm8,%%xmm7                    \n"
+      "neg        %4                               \n"
+      // Second round of bit swap.
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "movdqa     %%xmm1,%%xmm9                    \n"
+      "punpckhwd  %%xmm2,%%xmm8                    \n"
+      "punpckhwd  %%xmm3,%%xmm9                    \n"
+      "punpcklwd  %%xmm2,%%xmm0                    \n"
+      "punpcklwd  %%xmm3,%%xmm1                    \n"
+      "movdqa     %%xmm8,%%xmm2                    \n"
+      "movdqa     %%xmm9,%%xmm3                    \n"
+      "movdqa     %%xmm4,%%xmm8                    \n"
+      "movdqa     %%xmm5,%%xmm9                    \n"
+      "punpckhwd  %%xmm6,%%xmm8                    \n"
+      "punpckhwd  %%xmm7,%%xmm9                    \n"
+      "punpcklwd  %%xmm6,%%xmm4                    \n"
+      "punpcklwd  %%xmm7,%%xmm5                    \n"
+      "movdqa     %%xmm8,%%xmm6                    \n"
+      "movdqa     %%xmm9,%%xmm7                    \n"
+      // Third round of bit swap.
+      // Write to the destination pointer.
+      "movdqa     %%xmm0,%%xmm8                    \n"
+      "punpckldq  %%xmm4,%%xmm0                    \n"
+      "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
+      "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
+      "punpckhdq  %%xmm4,%%xmm8                    \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "movdqa     %%xmm2,%%xmm8                    \n"
+      "punpckldq  %%xmm6,%%xmm2                    \n"
+      "movlpd     %%xmm2,(%1)                      \n"
+      "movhpd     %%xmm2,(%2)                      \n"
+      "punpckhdq  %%xmm6,%%xmm8                    \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "movdqa     %%xmm1,%%xmm8                    \n"
+      "punpckldq  %%xmm5,%%xmm1                    \n"
+      "movlpd     %%xmm1,(%1)                      \n"
+      "movhpd     %%xmm1,(%2)                      \n"
+      "punpckhdq  %%xmm5,%%xmm8                    \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "movdqa     %%xmm3,%%xmm8                    \n"
+      "punpckldq  %%xmm7,%%xmm3                    \n"
+      "movlpd     %%xmm3,(%1)                      \n"
+      "movhpd     %%xmm3,(%2)                      \n"
+      "punpckhdq  %%xmm7,%%xmm8                    \n"
+      "sub        $0x8,%3                          \n"
+      "movlpd     %%xmm8,(%1,%5)                   \n"
+      "lea        (%1,%5,2),%1                     \n"
+      "movhpd     %%xmm8,(%2,%6)                   \n"
+      "lea        (%2,%6,2),%2                     \n"
+      "jg         1b                               \n"
+      : "+r"(src),                      // %0
+        "+r"(dst_a),                    // %1
+        "+r"(dst_b),                    // %2
+        "+r"(width)                     // %3
+      : "r"((intptr_t)(src_stride)),    // %4
+        "r"((intptr_t)(dst_stride_a)),  // %5
+        "r"((intptr_t)(dst_stride_b))   // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7", "xmm8", "xmm9");
 }
 #endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
 #endif  // defined(__x86_64__) || defined(__i386__)
--- a/third_party/libyuv/source/rotate_mips.cc
+++ /dev/null
@@ -1,484 +1,0 @@
-/*
- *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-#include "libyuv/rotate_row.h"
-
-#include "libyuv/basic_types.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-#if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void TransposeWx8_DSPR2(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
-   __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "sll              $t2, %[src_stride], 0x1          \n" // src_stride x 2
-      "sll              $t4, %[src_stride], 0x2          \n" // src_stride x 4
-      "sll              $t9, %[src_stride], 0x3          \n" // src_stride x 8
-      "addu             $t3, $t2, %[src_stride]          \n"
-      "addu             $t5, $t4, %[src_stride]          \n"
-      "addu             $t6, $t2, $t4                    \n"
-      "andi             $t0, %[dst], 0x3                 \n"
-      "andi             $t1, %[dst_stride], 0x3          \n"
-      "or               $t0, $t0, $t1                    \n"
-      "bnez             $t0, 11f                         \n"
-      " subu            $t7, $t9, %[src_stride]          \n"
-//dst + dst_stride word aligned
-    "1:                                                  \n"
-      "lbu              $t0, 0(%[src])                   \n"
-      "lbux             $t1, %[src_stride](%[src])       \n"
-      "lbux             $t8, $t2(%[src])                 \n"
-      "lbux             $t9, $t3(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s0, $t8, $t0                    \n"
-      "lbux             $t0, $t4(%[src])                 \n"
-      "lbux             $t1, $t5(%[src])                 \n"
-      "lbux             $t8, $t6(%[src])                 \n"
-      "lbux             $t9, $t7(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s1, $t8, $t0                    \n"
-      "sw               $s0, 0(%[dst])                   \n"
-      "addiu            %[width], -1                     \n"
-      "addiu            %[src], 1                        \n"
-      "sw               $s1, 4(%[dst])                   \n"
-      "bnez             %[width], 1b                     \n"
-      " addu            %[dst], %[dst], %[dst_stride]    \n"
-      "b                2f                               \n"
-//dst + dst_stride unaligned
-   "11:                                                  \n"
-      "lbu              $t0, 0(%[src])                   \n"
-      "lbux             $t1, %[src_stride](%[src])       \n"
-      "lbux             $t8, $t2(%[src])                 \n"
-      "lbux             $t9, $t3(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s0, $t8, $t0                    \n"
-      "lbux             $t0, $t4(%[src])                 \n"
-      "lbux             $t1, $t5(%[src])                 \n"
-      "lbux             $t8, $t6(%[src])                 \n"
-      "lbux             $t9, $t7(%[src])                 \n"
-      "sll              $t1, $t1, 16                     \n"
-      "sll              $t9, $t9, 16                     \n"
-      "or               $t0, $t0, $t1                    \n"
-      "or               $t8, $t8, $t9                    \n"
-      "precr.qb.ph      $s1, $t8, $t0                    \n"
-      "swr              $s0, 0(%[dst])                   \n"
-      "swl              $s0, 3(%[dst])                   \n"
-      "addiu            %[width], -1                     \n"
-      "addiu            %[src], 1                        \n"
-      "swr              $s1, 4(%[dst])                   \n"
-      "swl              $s1, 7(%[dst])                   \n"
-      "bnez             %[width], 11b                    \n"
-       "addu             %[dst], %[dst], %[dst_stride]   \n"
-    "2:                                                  \n"
-      ".set pop                                          \n"
-      :[src] "+r" (src),
-       [dst] "+r" (dst),
-       [width] "+r" (width)
-      :[src_stride] "r" (src_stride),
-       [dst_stride] "r" (dst_stride)
-      : "t0", "t1",  "t2", "t3", "t4", "t5",
-        "t6", "t7", "t8", "t9",
-        "s0", "s1"
-  );
-}
-
-void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
-                             uint8* dst, int dst_stride, int width) {
-  __asm__ __volatile__ (
-      ".set noat                                         \n"
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "beqz             %[width], 2f                     \n"
-      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
-      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
-      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
-      "addu             $t3, $t2, %[src_stride]          \n"
-      "addu             $t5, $t4, %[src_stride]          \n"
-      "addu             $t6, $t2, $t4                    \n"
-
-      "srl              $AT, %[width], 0x2               \n"
-      "andi             $t0, %[dst], 0x3                 \n"
-      "andi             $t1, %[dst_stride], 0x3          \n"
-      "or               $t0, $t0, $t1                    \n"
-      "bnez             $t0, 11f                         \n"
-      " subu            $t7, $t9, %[src_stride]          \n"
-//dst + dst_stride word aligned
-      "1:                                                \n"
-      "lw               $t0, 0(%[src])                   \n"
-      "lwx              $t1, %[src_stride](%[src])       \n"
-      "lwx              $t8, $t2(%[src])                 \n"
-      "lwx              $t9, $t3(%[src])                 \n"
-
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 21 | 01 | 20 | 00 |
-  // s1 = | 23 | 03 | 22 | 02 |
-  // s2 = | 31 | 11 | 30 | 10 |
-  // s3 = | 33 | 13 | 32 | 12 |
-
-      "precr.qb.ph     $s4, $s1, $s0                     \n"
-      "precrq.qb.ph    $s5, $s1, $s0                     \n"
-      "precr.qb.ph     $s6, $s3, $s2                     \n"
-      "precrq.qb.ph    $s7, $s3, $s2                     \n"
-
-  // s4 = | 03 | 02 | 01 | 00 |
-  // s5 = | 23 | 22 | 21 | 20 |
-  // s6 = | 13 | 12 | 11 | 10 |
-  // s7 = | 33 | 32 | 31 | 30 |
-
-      "lwx              $t0, $t4(%[src])                 \n"
-      "lwx              $t1, $t5(%[src])                 \n"
-      "lwx              $t8, $t6(%[src])                 \n"
-      "lwx              $t9, $t7(%[src])                 \n"
-
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 25 | 05 | 24 | 04 |
-  // s1 = | 27 | 07 | 26 | 06 |
-  // s2 = | 35 | 15 | 34 | 14 |
-  // s3 = | 37 | 17 | 36 | 16 |
-
-      "precr.qb.ph     $t0, $s1, $s0                     \n"
-      "precrq.qb.ph    $t1, $s1, $s0                     \n"
-      "precr.qb.ph     $t8, $s3, $s2                     \n"
-      "precrq.qb.ph    $t9, $s3, $s2                     \n"
-
-  // t0 = | 07 | 06 | 05 | 04 |
-  // t1 = | 27 | 26 | 25 | 24 |
-  // t8 = | 17 | 16 | 15 | 14 |
-  // t9 = | 37 | 36 | 35 | 34 |
-
-      "addu            $s0, %[dst], %[dst_stride]        \n"
-      "addu            $s1, $s0, %[dst_stride]           \n"
-      "addu            $s2, $s1, %[dst_stride]           \n"
-
-      "sw              $s4, 0(%[dst])                    \n"
-      "sw              $t0, 4(%[dst])                    \n"
-      "sw              $s6, 0($s0)                       \n"
-      "sw              $t8, 4($s0)                       \n"
-      "sw              $s5, 0($s1)                       \n"
-      "sw              $t1, 4($s1)                       \n"
-      "sw              $s7, 0($s2)                       \n"
-      "sw              $t9, 4($s2)                       \n"
-
-      "addiu            $AT, -1                          \n"
-      "addiu            %[src], 4                        \n"
-
-      "bnez             $AT, 1b                          \n"
-      " addu            %[dst], $s2, %[dst_stride]       \n"
-      "b                2f                               \n"
-//dst + dst_stride unaligned
-      "11:                                               \n"
-      "lw               $t0, 0(%[src])                   \n"
-      "lwx              $t1, %[src_stride](%[src])       \n"
-      "lwx              $t8, $t2(%[src])                 \n"
-      "lwx              $t9, $t3(%[src])                 \n"
-
-// t0 = | 30 | 20 | 10 | 00 |
-// t1 = | 31 | 21 | 11 | 01 |
-// t8 = | 32 | 22 | 12 | 02 |
-// t9 = | 33 | 23 | 13 | 03 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 21 | 01 | 20 | 00 |
-  // s1 = | 23 | 03 | 22 | 02 |
-  // s2 = | 31 | 11 | 30 | 10 |
-  // s3 = | 33 | 13 | 32 | 12 |
-
-      "precr.qb.ph     $s4, $s1, $s0                     \n"
-      "precrq.qb.ph    $s5, $s1, $s0                     \n"
-      "precr.qb.ph     $s6, $s3, $s2                     \n"
-      "precrq.qb.ph    $s7, $s3, $s2                     \n"
-
-  // s4 = | 03 | 02 | 01 | 00 |
-  // s5 = | 23 | 22 | 21 | 20 |
-  // s6 = | 13 | 12 | 11 | 10 |
-  // s7 = | 33 | 32 | 31 | 30 |
-
-      "lwx              $t0, $t4(%[src])                 \n"
-      "lwx              $t1, $t5(%[src])                 \n"
-      "lwx              $t8, $t6(%[src])                 \n"
-      "lwx              $t9, $t7(%[src])                 \n"
-
-// t0 = | 34 | 24 | 14 | 04 |
-// t1 = | 35 | 25 | 15 | 05 |
-// t8 = | 36 | 26 | 16 | 06 |
-// t9 = | 37 | 27 | 17 | 07 |
-
-      "precr.qb.ph     $s0, $t1, $t0                     \n"
-      "precr.qb.ph     $s1, $t9, $t8                     \n"
-      "precrq.qb.ph    $s2, $t1, $t0                     \n"
-      "precrq.qb.ph    $s3, $t9, $t8                     \n"
-
-  // s0 = | 25 | 05 | 24 | 04 |
-  // s1 = | 27 | 07 | 26 | 06 |
-  // s2 = | 35 | 15 | 34 | 14 |
-  // s3 = | 37 | 17 | 36 | 16 |
-
-      "precr.qb.ph     $t0, $s1, $s0                     \n"
-      "precrq.qb.ph    $t1, $s1, $s0                     \n"
-      "precr.qb.ph     $t8, $s3, $s2                     \n"
-      "precrq.qb.ph    $t9, $s3, $s2                     \n"
-
-  // t0 = | 07 | 06 | 05 | 04 |
-  // t1 = | 27 | 26 | 25 | 24 |
-  // t8 = | 17 | 16 | 15 | 14 |
-  // t9 = | 37 | 36 | 35 | 34 |
-
-      "addu            $s0, %[dst], %[dst_stride]        \n"
-      "addu            $s1, $s0, %[dst_stride]           \n"
-      "addu            $s2, $s1, %[dst_stride]           \n"
-
-      "swr              $s4, 0(%[dst])                   \n"
-      "swl              $s4, 3(%[dst])                   \n"
-      "swr              $t0, 4(%[dst])                   \n"
-      "swl              $t0, 7(%[dst])                   \n"
-      "swr              $s6, 0($s0)                      \n"
-      "swl              $s6, 3($s0)                      \n"
-      "swr              $t8, 4($s0)                      \n"
-      "swl              $t8, 7($s0)                      \n"
-      "swr              $s5, 0($s1)                      \n"
-      "swl              $s5, 3($s1)                      \n"
-      "swr              $t1, 4($s1)                      \n"
-      "swl              $t1, 7($s1)                      \n"
-      "swr              $s7, 0($s2)                      \n"
-      "swl              $s7, 3($s2)                      \n"
-      "swr              $t9, 4($s2)                      \n"
-      "swl              $t9, 7($s2)                      \n"
-
-      "addiu            $AT, -1                          \n"
-      "addiu            %[src], 4                        \n"
-
-      "bnez             $AT, 11b                         \n"
-      " addu            %[dst], $s2, %[dst_stride]       \n"
-      "2:                                                \n"
-      ".set pop                                          \n"
-      ".set at                                           \n"
-      :[src] "+r" (src),
-       [dst] "+r" (dst),
-       [width] "+r" (width)
-      :[src_stride] "r" (src_stride),
-       [dst_stride] "r" (dst_stride)
-      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
-        "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
-  );
-}
-
-void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
-                          uint8* dst_a, int dst_stride_a,
-                          uint8* dst_b, int dst_stride_b,
-                          int width) {
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "beqz            %[width], 2f                      \n"
-      " sll            $t2, %[src_stride], 0x1           \n" // src_stride x 2
-      "sll             $t4, %[src_stride], 0x2           \n" // src_stride x 4
-      "sll             $t9, %[src_stride], 0x3           \n" // src_stride x 8
-      "addu            $t3, $t2, %[src_stride]           \n"
-      "addu            $t5, $t4, %[src_stride]           \n"
-      "addu            $t6, $t2, $t4                     \n"
-      "subu            $t7, $t9, %[src_stride]           \n"
-      "srl             $t1, %[width], 1                  \n"
-
-// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
-      "andi            $t0, %[dst_a], 0x3                \n"
-      "andi            $t8, %[dst_b], 0x3                \n"
-      "or              $t0, $t0, $t8                     \n"
-      "andi            $t8, %[dst_stride_a], 0x3         \n"
-      "andi            $s5, %[dst_stride_b], 0x3         \n"
-      "or              $t8, $t8, $s5                     \n"
-      "or              $t0, $t0, $t8                     \n"
-      "bnez            $t0, 11f                          \n"
-      " nop                                              \n"
-// dst + dst_stride word aligned (both, a & b dst addresses)
-    "1:                                                  \n"
-      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
-      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
-      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
-      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
-      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
-      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
-
-      "sw              $s3, 0($s5)                       \n"
-      "sw              $s4, 0($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
-
-      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
-      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
-      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
-      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
-      "sw              $s3, 0(%[dst_a])                  \n"
-      "sw              $s4, 0(%[dst_b])                  \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
-      "sw              $s3, 4($s5)                       \n"
-      "sw              $s4, 4($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
-
-      "addiu           %[src], 4                         \n"
-      "addiu           $t1, -1                           \n"
-      "sll             $t0, %[dst_stride_a], 1           \n"
-      "sll             $t8, %[dst_stride_b], 1           \n"
-      "sw              $s3, 4(%[dst_a])                  \n"
-      "sw              $s4, 4(%[dst_b])                  \n"
-      "addu            %[dst_a], %[dst_a], $t0           \n"
-      "bnez            $t1, 1b                           \n"
-      " addu           %[dst_b], %[dst_b], $t8           \n"
-      "b               2f                                \n"
-      " nop                                              \n"
-
-// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
-   "11:                                                  \n"
-      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
-      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
-      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
-      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
-      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
-      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
-
-      "swr             $s3, 0($s5)                       \n"
-      "swl             $s3, 3($s5)                       \n"
-      "swr             $s4, 0($s6)                       \n"
-      "swl             $s4, 3($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
-
-      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
-      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
-      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
-      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
-      "swr             $s3, 0(%[dst_a])                  \n"
-      "swl             $s3, 3(%[dst_a])                  \n"
-      "swr             $s4, 0(%[dst_b])                  \n"
-      "swl             $s4, 3(%[dst_b])                  \n"
-
-      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
-      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
-
-      "sll             $t0, $t0, 16                      \n"
-      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
-      "sll             $t9, $t9, 16                      \n"
-      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
-
-      "swr             $s3, 4($s5)                       \n"
-      "swl             $s3, 7($s5)                       \n"
-      "swr             $s4, 4($s6)                       \n"
-      "swl             $s4, 7($s6)                       \n"
-
-      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
-      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
-
-      "addiu           %[src], 4                         \n"
-      "addiu           $t1, -1                           \n"
-      "sll             $t0, %[dst_stride_a], 1           \n"
-      "sll             $t8, %[dst_stride_b], 1           \n"
-      "swr             $s3, 4(%[dst_a])                  \n"
-      "swl             $s3, 7(%[dst_a])                  \n"
-      "swr             $s4, 4(%[dst_b])                  \n"
-      "swl             $s4, 7(%[dst_b])                  \n"
-      "addu            %[dst_a], %[dst_a], $t0           \n"
-      "bnez            $t1, 11b                          \n"
-      " addu           %[dst_b], %[dst_b], $t8           \n"
-
-      "2:                                                \n"
-      ".set pop                                          \n"
-      : [src] "+r" (src),
-        [dst_a] "+r" (dst_a),
-        [dst_b] "+r" (dst_b),
-        [width] "+r" (width),
-        [src_stride] "+r" (src_stride)
-      : [dst_stride_a] "r" (dst_stride_a),
-        [dst_stride_b] "r" (dst_stride_b)
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-        "t6", "t7", "t8", "t9",
-        "s0", "s1", "s2", "s3",
-        "s4", "s5", "s6"
-  );
-}
-
-#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
--- /dev/null
+++ b/third_party/libyuv/source/rotate_msa.cc
@@ -1,0 +1,250 @@
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate_row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
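+// ILVRL_{B,H,W,D} pair __msa_ilvr_* (interleave right/low halves) with
+// __msa_ilvl_* (interleave left/high halves) at byte, halfword, word and
+// doubleword width - the MSA counterparts of the punpckl/punpckh rounds in
+// the x86 paths above.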
+#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0);     \
+    out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0);     \
+    out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2);     \
+    out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2);     \
+  }
+
+#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0);     \
+    out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0);     \
+    out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2);     \
+    out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2);     \
+  }
+
+#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0);     \
+    out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0);     \
+    out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2);     \
+    out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2);     \
+  }
+
+#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \
+  {                                                         \
+    out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0);     \
+    out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0);     \
+    out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2);     \
+    out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2);     \
+  }
+
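+// A 16-row transpose is two stacked 8-row transposes: the second pass starts
+// 8 rows further down in src and 8 columns further right in dst.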
+void TransposeWx16_C(const uint8_t* src,
+                     int src_stride,
+                     uint8_t* dst,
+                     int dst_stride,
+                     int width) {
+  TransposeWx8_C(src, src_stride, dst, dst_stride, width);
+  TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride,
+                 width);
+}
+
+void TransposeUVWx16_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst_a,
+                       int dst_stride_a,
+                       uint8_t* dst_b,
+                       int dst_stride_b,
+                       int width) {
+  TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b,
+                   width);
+  TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8),
+                   dst_stride_a, (dst_b + 8), dst_stride_b, width);
+}
+
+void TransposeWx16_MSA(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width) {
+  int x;
+  const uint8_t* s;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
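+  // Per 16-column strip: sixteen 16-byte rows are loaded in groups of four,
+  // then interleaved at byte, halfword, word and doubleword width so that
+  // each ST_UB4 below stores four fully transposed output rows.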
+  for (x = 0; x < width; x += 16) {
+    s = src;
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+    ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
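+    // The word/doubleword interleaves below finish four output rows per step.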
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+    ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+    ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+    ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    dst += dst_stride * 4;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+    ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
+    src += 16;
+    dst += dst_stride * 4;
+  }
+}
+
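+// Interleaved-UV MSA version: the same interleave ladder, but alternating
+// transposed rows go to dst_a and dst_b, separating the U and V samples
+// into their own planes.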
+void TransposeUVWx16_MSA(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
+                         int width) {
+  int x;
+  const uint8_t* s;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9;
+
+  for (x = 0; x < width; x += 8) {
+    s = src;
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
+    ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3);
+    ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3);
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    s += src_stride;
+    ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3);
+    ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7);
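+    // Each ILVRL_D step below yields two rows for dst_a and two for dst_b.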
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0);
+    ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1);
+    ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2);
+    ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+    res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3);
+    res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3);
+    ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3);
+    ST_UB2(dst0, dst2, dst_a, dst_stride_a);
+    ST_UB2(dst1, dst3, dst_b, dst_stride_b);
+    src += 16;
+    dst_a += dst_stride_a * 2;
+    dst_b += dst_stride_b * 2;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
--- a/third_party/libyuv/source/rotate_neon.cc
+++ b/third_party/libyuv/source/rotate_neon.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #include "libyuv/basic_types.h"
 
@@ -21,38 +21,32 @@
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
     !defined(__aarch64__)
 
-static uvec8 kVTbl4x4Transpose =
-  { 0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 };
+static const uvec8 kVTbl4x4Transpose = {0, 4, 8,  12, 1, 5, 9,  13,
+                                        2, 6, 10, 14, 3, 7, 11, 15};
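+
+// Transposes the plane in 8x8 blocks with vtrn; the labeled tail blocks
+// handle residual widths of 4, 2 and 1.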
 
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride,
+void TransposeWx8_NEON(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
                        int width) {
-  const uint8* src_temp;
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %5, #8                        \n"
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allows for this
+      "sub         %5, #8                        \n"
 
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                        \n"
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                        \n"
       "mov         %0, %1                      \n"
 
-      MEMACCESS(0)
       "vld1.8      {d0}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d1}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d2}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d3}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d4}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d5}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d6}, [%0], %2              \n"
-      MEMACCESS(0)
       "vld1.8      {d7}, [%0]                  \n"
 
       "vtrn.8      d1, d0                      \n"
@@ -77,21 +71,13 @@
 
       "mov         %0, %3                      \n"
 
-    MEMACCESS(0)
       "vst1.8      {d1}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d0}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d3}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d2}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d5}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d4}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d7}, [%0], %4              \n"
-    MEMACCESS(0)
       "vst1.8      {d6}, [%0]                  \n"
 
       "add         %1, #8                      \n"  // src += 8
@@ -99,180 +85,138 @@
       "subs        %5,  #8                     \n"  // w   -= 8
       "bge         1b                          \n"
 
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %5, #8                        \n"
-    "beq         4f                            \n"
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds        %5, #8                        \n"
+      "beq         4f                            \n"
 
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %5, #2                        \n"
-    "blt         3f                            \n"
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp         %5, #2                        \n"
+      "blt         3f                            \n"
 
-    "cmp         %5, #4                        \n"
-    "blt         2f                            \n"
+      "cmp         %5, #4                        \n"
+      "blt         2f                            \n"
 
-    // 4x8 block
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.32     {d0[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d0[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d1[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d1[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d2[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d2[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d3[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.32     {d3[1]}, [%0]                 \n"
+      // 4x8 block
+      "mov         %0, %1                        \n"
+      "vld1.32     {d0[0]}, [%0], %2             \n"
+      "vld1.32     {d0[1]}, [%0], %2             \n"
+      "vld1.32     {d1[0]}, [%0], %2             \n"
+      "vld1.32     {d1[1]}, [%0], %2             \n"
+      "vld1.32     {d2[0]}, [%0], %2             \n"
+      "vld1.32     {d2[1]}, [%0], %2             \n"
+      "vld1.32     {d3[0]}, [%0], %2             \n"
+      "vld1.32     {d3[1]}, [%0]                 \n"
 
-    "mov         %0, %3                        \n"
+      "mov         %0, %3                        \n"
 
-    MEMACCESS(6)
-    "vld1.8      {q3}, [%6]                    \n"
+      "vld1.8      {q3}, [%6]                    \n"
 
-    "vtbl.8      d4, {d0, d1}, d6              \n"
-    "vtbl.8      d5, {d0, d1}, d7              \n"
-    "vtbl.8      d0, {d2, d3}, d6              \n"
-    "vtbl.8      d1, {d2, d3}, d7              \n"
+      "vtbl.8      d4, {d0, d1}, d6              \n"
+      "vtbl.8      d5, {d0, d1}, d7              \n"
+      "vtbl.8      d0, {d2, d3}, d6              \n"
+      "vtbl.8      d1, {d2, d3}, d7              \n"
 
-    // TODO(frkoenig): Rework shuffle above to
-    // write out with 4 instead of 8 writes.
-    MEMACCESS(0)
-    "vst1.32     {d4[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d4[1]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d5[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d5[1]}, [%0]                 \n"
+      // TODO(frkoenig): Rework shuffle above to
+      // write out with 4 instead of 8 writes.
+      "vst1.32     {d4[0]}, [%0], %4             \n"
+      "vst1.32     {d4[1]}, [%0], %4             \n"
+      "vst1.32     {d5[0]}, [%0], %4             \n"
+      "vst1.32     {d5[1]}, [%0]                 \n"
 
-    "add         %0, %3, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d0[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d0[1]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d1[0]}, [%0], %4             \n"
-    MEMACCESS(0)
-    "vst1.32     {d1[1]}, [%0]                 \n"
+      "add         %0, %3, #4                    \n"
+      "vst1.32     {d0[0]}, [%0], %4             \n"
+      "vst1.32     {d0[1]}, [%0], %4             \n"
+      "vst1.32     {d1[0]}, [%0], %4             \n"
+      "vst1.32     {d1[1]}, [%0]                 \n"
 
-    "add         %1, #4                        \n"  // src += 4
-    "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
-    "subs        %5,  #4                       \n"  // w   -= 4
-    "beq         4f                            \n"
+      "add         %1, #4                        \n"  // src += 4
+      "add         %3, %3, %4, lsl #2            \n"  // dst += 4 * dst_stride
+      "subs        %5,  #4                       \n"  // w   -= 4
+      "beq         4f                            \n"
 
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %5, #2                        \n"
-    "blt         3f                            \n"
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp         %5, #2                        \n"
+      "blt         3f                            \n"
 
-    // 2x8 block
-    "2:                                        \n"
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[0]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[1]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[2]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[2]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d0[3]}, [%0], %2             \n"
-    MEMACCESS(0)
-    "vld1.16     {d1[3]}, [%0]                 \n"
+      // 2x8 block
+      "2:                                        \n"
+      "mov         %0, %1                        \n"
+      "vld1.16     {d0[0]}, [%0], %2             \n"
+      "vld1.16     {d1[0]}, [%0], %2             \n"
+      "vld1.16     {d0[1]}, [%0], %2             \n"
+      "vld1.16     {d1[1]}, [%0], %2             \n"
+      "vld1.16     {d0[2]}, [%0], %2             \n"
+      "vld1.16     {d1[2]}, [%0], %2             \n"
+      "vld1.16     {d0[3]}, [%0], %2             \n"
+      "vld1.16     {d1[3]}, [%0]                 \n"
 
-    "vtrn.8      d0, d1                        \n"
+      "vtrn.8      d0, d1                        \n"
 
-    "mov         %0, %3                        \n"
+      "mov         %0, %3                        \n"
 
-    MEMACCESS(0)
-    "vst1.64     {d0}, [%0], %4                \n"
-    MEMACCESS(0)
-    "vst1.64     {d1}, [%0]                    \n"
+      "vst1.64     {d0}, [%0], %4                \n"
+      "vst1.64     {d1}, [%0]                    \n"
 
-    "add         %1, #2                        \n"  // src += 2
-    "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
-    "subs        %5,  #2                       \n"  // w   -= 2
-    "beq         4f                            \n"
+      "add         %1, #2                        \n"  // src += 2
+      "add         %3, %3, %4, lsl #1            \n"  // dst += 2 * dst_stride
+      "subs        %5,  #2                       \n"  // w   -= 2
+      "beq         4f                            \n"
 
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[0]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[1]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[2]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[3]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[4]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[5]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[6]}, [%1], %2             \n"
-    MEMACCESS(1)
-    "vld1.8      {d0[7]}, [%1]                 \n"
+      // 1x8 block
+      "3:                                        \n"
+      "vld1.8      {d0[0]}, [%1], %2             \n"
+      "vld1.8      {d0[1]}, [%1], %2             \n"
+      "vld1.8      {d0[2]}, [%1], %2             \n"
+      "vld1.8      {d0[3]}, [%1], %2             \n"
+      "vld1.8      {d0[4]}, [%1], %2             \n"
+      "vld1.8      {d0[5]}, [%1], %2             \n"
+      "vld1.8      {d0[6]}, [%1], %2             \n"
+      "vld1.8      {d0[7]}, [%1]                 \n"
 
-    MEMACCESS(3)
-    "vst1.64     {d0}, [%3]                    \n"
+      "vst1.64     {d0}, [%3]                    \n"
 
-    "4:                                        \n"
+      "4:                                        \n"
 
-    : "=&r"(src_temp),         // %0
-      "+r"(src),               // %1
-      "+r"(src_stride),        // %2
-      "+r"(dst),               // %3
-      "+r"(dst_stride),        // %4
-      "+r"(width)              // %5
-    : "r"(&kVTbl4x4Transpose)  // %6
-    : "memory", "cc", "q0", "q1", "q2", "q3"
-  );
+      : "=&r"(src_temp),         // %0
+        "+r"(src),               // %1
+        "+r"(src_stride),        // %2
+        "+r"(dst),               // %3
+        "+r"(dst_stride),        // %4
+        "+r"(width)              // %5
+      : "r"(&kVTbl4x4Transpose)  // %6
+      : "memory", "cc", "q0", "q1", "q2", "q3");
 }
 
-static uvec8 kVTbl4x4TransposeDi =
-  { 0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15 };
+static const uvec8 kVTbl4x4TransposeDi = {0, 8,  1, 9,  2, 10, 3, 11,
+                                          4, 12, 5, 13, 6, 14, 7, 15};
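+
+// UV variant: vld2 deinterleaves the two channels on load, so after the
+// transpose one channel's rows store to dst_a and the other's to dst_b.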
 
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_NEON(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
                          int width) {
-  const uint8* src_temp;
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %7, #8                        \n"
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allows for this
+      "sub         %7, #8                        \n"
 
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                        \n"
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                        \n"
       "mov         %0, %1                      \n"
 
-      MEMACCESS(0)
       "vld2.8      {d0,  d1},  [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d2,  d3},  [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d4,  d5},  [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d6,  d7},  [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d16, d17}, [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d18, d19}, [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d20, d21}, [%0], %2        \n"
-      MEMACCESS(0)
       "vld2.8      {d22, d23}, [%0]            \n"
 
       "vtrn.8      q1, q0                      \n"
@@ -301,40 +245,24 @@
 
       "mov         %0, %3                      \n"
 
-    MEMACCESS(0)
       "vst1.8      {d2},  [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d0},  [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d6},  [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d4},  [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d18}, [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d16}, [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d22}, [%0], %4             \n"
-    MEMACCESS(0)
       "vst1.8      {d20}, [%0]                 \n"
 
       "mov         %0, %5                      \n"
 
-    MEMACCESS(0)
       "vst1.8      {d3},  [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d1},  [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d7},  [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d5},  [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d19}, [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d17}, [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d23}, [%0], %6             \n"
-    MEMACCESS(0)
       "vst1.8      {d21}, [%0]                 \n"
 
       "add         %1, #8*2                    \n"  // src   += 8*2
@@ -343,187 +271,142 @@
       "subs        %7,  #8                     \n"  // w     -= 8
       "bge         1b                          \n"
 
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %7, #8                        \n"
-    "beq         4f                            \n"
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds        %7, #8                        \n"
+      "beq         4f                            \n"
 
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %7, #2                        \n"
-    "blt         3f                            \n"
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp         %7, #2                        \n"
+      "blt         3f                            \n"
 
-    "cmp         %7, #4                        \n"
-    "blt         2f                            \n"
+      "cmp         %7, #4                        \n"
+      "blt         2f                            \n"
 
-    // TODO(frkoenig): Clean this up
-    // 4x8 block
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld1.64     {d0}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d1}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d2}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d3}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d4}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d5}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d6}, [%0], %2                \n"
-    MEMACCESS(0)
-    "vld1.64     {d7}, [%0]                    \n"
+      // TODO(frkoenig): Clean this up
+      // 4x8 block
+      "mov         %0, %1                        \n"
+      "vld1.64     {d0}, [%0], %2                \n"
+      "vld1.64     {d1}, [%0], %2                \n"
+      "vld1.64     {d2}, [%0], %2                \n"
+      "vld1.64     {d3}, [%0], %2                \n"
+      "vld1.64     {d4}, [%0], %2                \n"
+      "vld1.64     {d5}, [%0], %2                \n"
+      "vld1.64     {d6}, [%0], %2                \n"
+      "vld1.64     {d7}, [%0]                    \n"
 
-    MEMACCESS(8)
-    "vld1.8      {q15}, [%8]                   \n"
+      "vld1.8      {q15}, [%8]                   \n"
 
-    "vtrn.8      q0, q1                        \n"
-    "vtrn.8      q2, q3                        \n"
+      "vtrn.8      q0, q1                        \n"
+      "vtrn.8      q2, q3                        \n"
 
-    "vtbl.8      d16, {d0, d1}, d30            \n"
-    "vtbl.8      d17, {d0, d1}, d31            \n"
-    "vtbl.8      d18, {d2, d3}, d30            \n"
-    "vtbl.8      d19, {d2, d3}, d31            \n"
-    "vtbl.8      d20, {d4, d5}, d30            \n"
-    "vtbl.8      d21, {d4, d5}, d31            \n"
-    "vtbl.8      d22, {d6, d7}, d30            \n"
-    "vtbl.8      d23, {d6, d7}, d31            \n"
+      "vtbl.8      d16, {d0, d1}, d30            \n"
+      "vtbl.8      d17, {d0, d1}, d31            \n"
+      "vtbl.8      d18, {d2, d3}, d30            \n"
+      "vtbl.8      d19, {d2, d3}, d31            \n"
+      "vtbl.8      d20, {d4, d5}, d30            \n"
+      "vtbl.8      d21, {d4, d5}, d31            \n"
+      "vtbl.8      d22, {d6, d7}, d30            \n"
+      "vtbl.8      d23, {d6, d7}, d31            \n"
 
-    "mov         %0, %3                        \n"
+      "mov         %0, %3                        \n"
 
-    MEMACCESS(0)
-    "vst1.32     {d16[0]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d16[1]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d17[0]},  [%0], %4           \n"
-    MEMACCESS(0)
-    "vst1.32     {d17[1]},  [%0], %4           \n"
+      "vst1.32     {d16[0]},  [%0], %4           \n"
+      "vst1.32     {d16[1]},  [%0], %4           \n"
+      "vst1.32     {d17[0]},  [%0], %4           \n"
+      "vst1.32     {d17[1]},  [%0], %4           \n"
 
-    "add         %0, %3, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d20[0]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d20[1]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d21[0]}, [%0], %4            \n"
-    MEMACCESS(0)
-    "vst1.32     {d21[1]}, [%0]                \n"
+      "add         %0, %3, #4                    \n"
+      "vst1.32     {d20[0]}, [%0], %4            \n"
+      "vst1.32     {d20[1]}, [%0], %4            \n"
+      "vst1.32     {d21[0]}, [%0], %4            \n"
+      "vst1.32     {d21[1]}, [%0]                \n"
 
-    "mov         %0, %5                        \n"
+      "mov         %0, %5                        \n"
 
-    MEMACCESS(0)
-    "vst1.32     {d18[0]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d18[1]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d19[0]}, [%0], %6            \n"
-    MEMACCESS(0)
-    "vst1.32     {d19[1]}, [%0], %6            \n"
+      "vst1.32     {d18[0]}, [%0], %6            \n"
+      "vst1.32     {d18[1]}, [%0], %6            \n"
+      "vst1.32     {d19[0]}, [%0], %6            \n"
+      "vst1.32     {d19[1]}, [%0], %6            \n"
 
-    "add         %0, %5, #4                    \n"
-    MEMACCESS(0)
-    "vst1.32     {d22[0]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d22[1]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d23[0]},  [%0], %6           \n"
-    MEMACCESS(0)
-    "vst1.32     {d23[1]},  [%0]               \n"
+      "add         %0, %5, #4                    \n"
+      "vst1.32     {d22[0]},  [%0], %6           \n"
+      "vst1.32     {d22[1]},  [%0], %6           \n"
+      "vst1.32     {d23[0]},  [%0], %6           \n"
+      "vst1.32     {d23[1]},  [%0]               \n"
 
-    "add         %1, #4*2                      \n"  // src   += 4 * 2
-    "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 * dst_stride_a
-    "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 * dst_stride_b
-    "subs        %7,  #4                       \n"  // w     -= 4
-    "beq         4f                            \n"
+      "add         %1, #4*2                      \n"  // src   += 4 * 2
+      "add         %3, %3, %4, lsl #2            \n"  // dst_a += 4 *
+                                                      // dst_stride_a
+      "add         %5, %5, %6, lsl #2            \n"  // dst_b += 4 *
+                                                      // dst_stride_b
+      "subs        %7,  #4                       \n"  // w     -= 4
+      "beq         4f                            \n"
 
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %7, #2                        \n"
-    "blt         3f                            \n"
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp         %7, #2                        \n"
+      "blt         3f                            \n"
 
-    // 2x8 block
-    "2:                                        \n"
-    "mov         %0, %1                        \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
-    MEMACCESS(0)
-    "vld2.16     {d1[3], d3[3]}, [%0]          \n"
+      // 2x8 block
+      "2:                                        \n"
+      "mov         %0, %1                        \n"
+      "vld2.16     {d0[0], d2[0]}, [%0], %2      \n"
+      "vld2.16     {d1[0], d3[0]}, [%0], %2      \n"
+      "vld2.16     {d0[1], d2[1]}, [%0], %2      \n"
+      "vld2.16     {d1[1], d3[1]}, [%0], %2      \n"
+      "vld2.16     {d0[2], d2[2]}, [%0], %2      \n"
+      "vld2.16     {d1[2], d3[2]}, [%0], %2      \n"
+      "vld2.16     {d0[3], d2[3]}, [%0], %2      \n"
+      "vld2.16     {d1[3], d3[3]}, [%0]          \n"
 
-    "vtrn.8      d0, d1                        \n"
-    "vtrn.8      d2, d3                        \n"
+      "vtrn.8      d0, d1                        \n"
+      "vtrn.8      d2, d3                        \n"
 
-    "mov         %0, %3                        \n"
+      "mov         %0, %3                        \n"
 
-    MEMACCESS(0)
-    "vst1.64     {d0}, [%0], %4                \n"
-    MEMACCESS(0)
-    "vst1.64     {d2}, [%0]                    \n"
+      "vst1.64     {d0}, [%0], %4                \n"
+      "vst1.64     {d2}, [%0]                    \n"
 
-    "mov         %0, %5                        \n"
+      "mov         %0, %5                        \n"
 
-    MEMACCESS(0)
-    "vst1.64     {d1}, [%0], %6                \n"
-    MEMACCESS(0)
-    "vst1.64     {d3}, [%0]                    \n"
+      "vst1.64     {d1}, [%0], %6                \n"
+      "vst1.64     {d3}, [%0]                    \n"
 
-    "add         %1, #2*2                      \n"  // src   += 2 * 2
-    "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 * dst_stride_a
-    "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 * dst_stride_b
-    "subs        %7,  #2                       \n"  // w     -= 2
-    "beq         4f                            \n"
+      "add         %1, #2*2                      \n"  // src   += 2 * 2
+      "add         %3, %3, %4, lsl #1            \n"  // dst_a += 2 *
+                                                      // dst_stride_a
+      "add         %5, %5, %6, lsl #1            \n"  // dst_b += 2 *
+                                                      // dst_stride_b
+      "subs        %7,  #2                       \n"  // w     -= 2
+      "beq         4f                            \n"
 
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
-    MEMACCESS(1)
-    "vld2.8      {d0[7], d1[7]}, [%1]          \n"
+      // 1x8 block
+      "3:                                        \n"
+      "vld2.8      {d0[0], d1[0]}, [%1], %2      \n"
+      "vld2.8      {d0[1], d1[1]}, [%1], %2      \n"
+      "vld2.8      {d0[2], d1[2]}, [%1], %2      \n"
+      "vld2.8      {d0[3], d1[3]}, [%1], %2      \n"
+      "vld2.8      {d0[4], d1[4]}, [%1], %2      \n"
+      "vld2.8      {d0[5], d1[5]}, [%1], %2      \n"
+      "vld2.8      {d0[6], d1[6]}, [%1], %2      \n"
+      "vld2.8      {d0[7], d1[7]}, [%1]          \n"
 
-    MEMACCESS(3)
-    "vst1.64     {d0}, [%3]                    \n"
-    MEMACCESS(5)
-    "vst1.64     {d1}, [%5]                    \n"
+      "vst1.64     {d0}, [%3]                    \n"
+      "vst1.64     {d1}, [%5]                    \n"
 
-    "4:                                        \n"
+      "4:                                        \n"
 
-    : "=&r"(src_temp),           // %0
-      "+r"(src),                 // %1
-      "+r"(src_stride),          // %2
-      "+r"(dst_a),               // %3
-      "+r"(dst_stride_a),        // %4
-      "+r"(dst_b),               // %5
-      "+r"(dst_stride_b),        // %6
-      "+r"(width)                // %7
-    : "r"(&kVTbl4x4TransposeDi)  // %8
-    : "memory", "cc",
-      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
-  );
+      : "=&r"(src_temp),           // %0
+        "+r"(src),                 // %1
+        "+r"(src_stride),          // %2
+        "+r"(dst_a),               // %3
+        "+r"(dst_stride_a),        // %4
+        "+r"(dst_b),               // %5
+        "+r"(dst_stride_b),        // %6
+        "+r"(width)                // %7
+      : "r"(&kVTbl4x4TransposeDi)  // %8
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
 }
 #endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
 
--- a/third_party/libyuv/source/rotate_neon64.cc
+++ b/third_party/libyuv/source/rotate_neon64.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #include "libyuv/basic_types.h"
 
@@ -21,38 +21,32 @@
 // This module is for GCC Neon armv8 64 bit.
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
-static uvec8 kVTbl4x4Transpose =
-  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+static const uvec8 kVTbl4x4Transpose = {0, 4, 8,  12, 1, 5, 9,  13,
+                                        2, 6, 10, 14, 3, 7, 11, 15};
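+
+// AArch64 version. width is now referenced as a 32-bit operand (%w3), so
+// the old int64 width64 copy that worked around a clang 3.4 warning is no
+// longer needed.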
 
-void TransposeWx8_NEON(const uint8* src, int src_stride,
-                       uint8* dst, int dst_stride, int width) {
-  const uint8* src_temp;
-  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub         %3, %3, #8                      \n"
+void TransposeWx8_NEON(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width) {
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allows for this
+      "sub         %w3, %w3, #8                     \n"
 
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                          \n"
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                          \n"
       "mov         %0, %1                        \n"
 
-      MEMACCESS(0)
       "ld1        {v0.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v1.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v2.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v3.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v4.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v5.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v6.8b}, [%0], %5              \n"
-      MEMACCESS(0)
       "ld1        {v7.8b}, [%0]                  \n"
 
       "trn2     v16.8b, v0.8b, v1.8b             \n"
@@ -84,456 +78,345 @@
 
       "mov         %0, %2                        \n"
 
-    MEMACCESS(0)
       "st1      {v17.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v16.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v19.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v18.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v21.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v20.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v23.8b}, [%0], %6               \n"
-    MEMACCESS(0)
       "st1      {v22.8b}, [%0]                   \n"
 
       "add         %1, %1, #8                    \n"  // src += 8
       "add         %2, %2, %6, lsl #3            \n"  // dst += 8 * dst_stride
-      "subs        %3, %3, #8                    \n"  // w   -= 8
+      "subs        %w3, %w3, #8                  \n"  // w   -= 8
       "b.ge        1b                            \n"
 
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds        %3, %3, #8                      \n"
-    "b.eq        4f                              \n"
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds        %w3, %w3, #8                    \n"
+      "b.eq        4f                              \n"
 
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp         %3, #2                          \n"
-    "b.lt        3f                              \n"
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp         %w3, #2                          \n"
+      "b.lt        3f                              \n"
 
-    "cmp         %3, #4                          \n"
-    "b.lt        2f                              \n"
+      "cmp         %w3, #4                          \n"
+      "b.lt        2f                              \n"
 
-    // 4x8 block
-    "mov         %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.s}[3], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.s}[3], [%0]                     \n"
+      // 4x8 block
+      "mov         %0, %1                          \n"
+      "ld1     {v0.s}[0], [%0], %5                 \n"
+      "ld1     {v0.s}[1], [%0], %5                 \n"
+      "ld1     {v0.s}[2], [%0], %5                 \n"
+      "ld1     {v0.s}[3], [%0], %5                 \n"
+      "ld1     {v1.s}[0], [%0], %5                 \n"
+      "ld1     {v1.s}[1], [%0], %5                 \n"
+      "ld1     {v1.s}[2], [%0], %5                 \n"
+      "ld1     {v1.s}[3], [%0]                     \n"
 
-    "mov         %0, %2                          \n"
+      "mov         %0, %2                          \n"
 
-    MEMACCESS(4)
-    "ld1      {v2.16b}, [%4]                     \n"
+      "ld1      {v2.16b}, [%4]                     \n"
 
-    "tbl      v3.16b, {v0.16b}, v2.16b           \n"
-    "tbl      v0.16b, {v1.16b}, v2.16b           \n"
+      "tbl      v3.16b, {v0.16b}, v2.16b           \n"
+      "tbl      v0.16b, {v1.16b}, v2.16b           \n"
 
-    // TODO(frkoenig): Rework shuffle above to
-    // write out with 4 instead of 8 writes.
-    MEMACCESS(0)
-    "st1 {v3.s}[0], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[1], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[2], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v3.s}[3], [%0]                         \n"
+      // TODO(frkoenig): Rework shuffle above to
+      // write out with 4 instead of 8 writes.
+      "st1 {v3.s}[0], [%0], %6                     \n"
+      "st1 {v3.s}[1], [%0], %6                     \n"
+      "st1 {v3.s}[2], [%0], %6                     \n"
+      "st1 {v3.s}[3], [%0]                         \n"
 
-    "add         %0, %2, #4                      \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[0], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[1], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[2], [%0], %6                     \n"
-    MEMACCESS(0)
-    "st1 {v0.s}[3], [%0]                         \n"
+      "add         %0, %2, #4                      \n"
+      "st1 {v0.s}[0], [%0], %6                     \n"
+      "st1 {v0.s}[1], [%0], %6                     \n"
+      "st1 {v0.s}[2], [%0], %6                     \n"
+      "st1 {v0.s}[3], [%0]                         \n"
 
-    "add         %1, %1, #4                      \n"  // src += 4
-    "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
-    "subs        %3, %3, #4                      \n"  // w   -= 4
-    "b.eq        4f                              \n"
+      "add         %1, %1, #4                      \n"  // src += 4
+      "add         %2, %2, %6, lsl #2              \n"  // dst += 4 * dst_stride
+      "subs        %w3, %w3, #4                    \n"  // w   -= 4
+      "b.eq        4f                              \n"
 
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp         %3, #2                          \n"
-    "b.lt        3f                              \n"
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp         %w3, #2                         \n"
+      "b.lt        3f                              \n"
 
-    // 2x8 block
-    "2:                                          \n"
-    "mov         %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[0], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[1], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[2], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v0.h}[3], [%0], %5                 \n"
-    MEMACCESS(0)
-    "ld1     {v1.h}[3], [%0]                     \n"
+      // 2x8 block
+      "2:                                          \n"
+      "mov         %0, %1                          \n"
+      "ld1     {v0.h}[0], [%0], %5                 \n"
+      "ld1     {v1.h}[0], [%0], %5                 \n"
+      "ld1     {v0.h}[1], [%0], %5                 \n"
+      "ld1     {v1.h}[1], [%0], %5                 \n"
+      "ld1     {v0.h}[2], [%0], %5                 \n"
+      "ld1     {v1.h}[2], [%0], %5                 \n"
+      "ld1     {v0.h}[3], [%0], %5                 \n"
+      "ld1     {v1.h}[3], [%0]                     \n"
 
-    "trn2    v2.8b, v0.8b, v1.8b                 \n"
-    "trn1    v3.8b, v0.8b, v1.8b                 \n"
+      "trn2    v2.8b, v0.8b, v1.8b                 \n"
+      "trn1    v3.8b, v0.8b, v1.8b                 \n"
 
-    "mov         %0, %2                          \n"
+      "mov         %0, %2                          \n"
 
-    MEMACCESS(0)
-    "st1     {v3.8b}, [%0], %6                   \n"
-    MEMACCESS(0)
-    "st1     {v2.8b}, [%0]                       \n"
+      "st1     {v3.8b}, [%0], %6                   \n"
+      "st1     {v2.8b}, [%0]                       \n"
 
-    "add         %1, %1, #2                      \n"  // src += 2
-    "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
-    "subs        %3, %3,  #2                     \n"  // w   -= 2
-    "b.eq        4f                              \n"
+      "add         %1, %1, #2                      \n"  // src += 2
+      "add         %2, %2, %6, lsl #1              \n"  // dst += 2 * dst_stride
+      "subs        %w3, %w3,  #2                   \n"  // w   -= 2
+      "b.eq        4f                              \n"
 
-    // 1x8 block
-    "3:                                          \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[0], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[1], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[2], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[3], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[4], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[5], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[6], [%1], %5             \n"
-    MEMACCESS(1)
-    "ld1         {v0.b}[7], [%1]                 \n"
+      // 1x8 block
+      "3:                                          \n"
+      "ld1         {v0.b}[0], [%1], %5             \n"
+      "ld1         {v0.b}[1], [%1], %5             \n"
+      "ld1         {v0.b}[2], [%1], %5             \n"
+      "ld1         {v0.b}[3], [%1], %5             \n"
+      "ld1         {v0.b}[4], [%1], %5             \n"
+      "ld1         {v0.b}[5], [%1], %5             \n"
+      "ld1         {v0.b}[6], [%1], %5             \n"
+      "ld1         {v0.b}[7], [%1]                 \n"
 
-    MEMACCESS(2)
-    "st1         {v0.8b}, [%2]                   \n"
+      "st1         {v0.8b}, [%2]                   \n"
 
-    "4:                                          \n"
+      "4:                                          \n"
 
-    : "=&r"(src_temp),                            // %0
-      "+r"(src),                                  // %1
-      "+r"(dst),                                  // %2
-      "+r"(width64)                               // %3
-    : "r"(&kVTbl4x4Transpose),                    // %4
-      "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
-      "r"(static_cast<ptrdiff_t>(dst_stride))     // %6
-    : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
-      "v17", "v18", "v19", "v20", "v21", "v22", "v23"
-  );
+      : "=&r"(src_temp),                          // %0
+        "+r"(src),                                // %1
+        "+r"(dst),                                // %2
+        "+r"(width)                               // %3
+      : "r"(&kVTbl4x4Transpose),                  // %4
+        "r"(static_cast<ptrdiff_t>(src_stride)),  // %5
+        "r"(static_cast<ptrdiff_t>(dst_stride))   // %6
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v20", "v21", "v22", "v23");
 }
 
-static uint8 kVTbl4x4TransposeDi[32] =
-  { 0,  16, 32, 48,  2, 18, 34, 50,  4, 20, 36, 52,  6, 22, 38, 54,
-    1,  17, 33, 49,  3, 19, 35, 51,  5, 21, 37, 53,  7, 23, 39, 55};
+static const uint8_t kVTbl4x4TransposeDi[32] = {
+    0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54,
+    1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55};
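+
+// UV variant for AArch64: the 32-byte kVTbl4x4TransposeDi table is loaded
+// into v30/v31 and drives the four-register tbl shuffles in the 4x8
+// residual path.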
 
-void TransposeUVWx8_NEON(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
+void TransposeUVWx8_NEON(const uint8_t* src,
+                         int src_stride,
+                         uint8_t* dst_a,
+                         int dst_stride_a,
+                         uint8_t* dst_b,
+                         int dst_stride_b,
                          int width) {
-  const uint8* src_temp;
-  int64 width64 = (int64) width;  // Work around clang 3.4 warning.
-  asm volatile (
-    // loops are on blocks of 8. loop will stop when
-    // counter gets to or below 0. starting the counter
-    // at w-8 allow for this
-    "sub       %4, %4, #8                      \n"
+  const uint8_t* src_temp;
+  asm volatile(
+      // loops are on blocks of 8. loop will stop when
+      // counter gets to or below 0. starting the counter
+      // at w-8 allows for this
+      "sub       %w4, %w4, #8                    \n"
 
-    // handle 8x8 blocks. this should be the majority of the plane
-    "1:                                        \n"
-    "mov       %0, %1                          \n"
+      // handle 8x8 blocks. this should be the majority of the plane
+      "1:                                        \n"
+      "mov       %0, %1                          \n"
 
-    MEMACCESS(0)
-    "ld1       {v0.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v1.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v2.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v3.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v4.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v5.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v6.16b}, [%0], %5              \n"
-    MEMACCESS(0)
-    "ld1       {v7.16b}, [%0]                  \n"
+      "ld1       {v0.16b}, [%0], %5              \n"
+      "ld1       {v1.16b}, [%0], %5              \n"
+      "ld1       {v2.16b}, [%0], %5              \n"
+      "ld1       {v3.16b}, [%0], %5              \n"
+      "ld1       {v4.16b}, [%0], %5              \n"
+      "ld1       {v5.16b}, [%0], %5              \n"
+      "ld1       {v6.16b}, [%0], %5              \n"
+      "ld1       {v7.16b}, [%0]                  \n"
 
-    "trn1      v16.16b, v0.16b, v1.16b         \n"
-    "trn2      v17.16b, v0.16b, v1.16b         \n"
-    "trn1      v18.16b, v2.16b, v3.16b         \n"
-    "trn2      v19.16b, v2.16b, v3.16b         \n"
-    "trn1      v20.16b, v4.16b, v5.16b         \n"
-    "trn2      v21.16b, v4.16b, v5.16b         \n"
-    "trn1      v22.16b, v6.16b, v7.16b         \n"
-    "trn2      v23.16b, v6.16b, v7.16b         \n"
+      "trn1      v16.16b, v0.16b, v1.16b         \n"
+      "trn2      v17.16b, v0.16b, v1.16b         \n"
+      "trn1      v18.16b, v2.16b, v3.16b         \n"
+      "trn2      v19.16b, v2.16b, v3.16b         \n"
+      "trn1      v20.16b, v4.16b, v5.16b         \n"
+      "trn2      v21.16b, v4.16b, v5.16b         \n"
+      "trn1      v22.16b, v6.16b, v7.16b         \n"
+      "trn2      v23.16b, v6.16b, v7.16b         \n"
 
-    "trn1      v0.8h, v16.8h, v18.8h           \n"
-    "trn2      v1.8h, v16.8h, v18.8h           \n"
-    "trn1      v2.8h, v20.8h, v22.8h           \n"
-    "trn2      v3.8h, v20.8h, v22.8h           \n"
-    "trn1      v4.8h, v17.8h, v19.8h           \n"
-    "trn2      v5.8h, v17.8h, v19.8h           \n"
-    "trn1      v6.8h, v21.8h, v23.8h           \n"
-    "trn2      v7.8h, v21.8h, v23.8h           \n"
+      "trn1      v0.8h, v16.8h, v18.8h           \n"
+      "trn2      v1.8h, v16.8h, v18.8h           \n"
+      "trn1      v2.8h, v20.8h, v22.8h           \n"
+      "trn2      v3.8h, v20.8h, v22.8h           \n"
+      "trn1      v4.8h, v17.8h, v19.8h           \n"
+      "trn2      v5.8h, v17.8h, v19.8h           \n"
+      "trn1      v6.8h, v21.8h, v23.8h           \n"
+      "trn2      v7.8h, v21.8h, v23.8h           \n"
 
-    "trn1      v16.4s, v0.4s, v2.4s            \n"
-    "trn2      v17.4s, v0.4s, v2.4s            \n"
-    "trn1      v18.4s, v1.4s, v3.4s            \n"
-    "trn2      v19.4s, v1.4s, v3.4s            \n"
-    "trn1      v20.4s, v4.4s, v6.4s            \n"
-    "trn2      v21.4s, v4.4s, v6.4s            \n"
-    "trn1      v22.4s, v5.4s, v7.4s            \n"
-    "trn2      v23.4s, v5.4s, v7.4s            \n"
+      "trn1      v16.4s, v0.4s, v2.4s            \n"
+      "trn2      v17.4s, v0.4s, v2.4s            \n"
+      "trn1      v18.4s, v1.4s, v3.4s            \n"
+      "trn2      v19.4s, v1.4s, v3.4s            \n"
+      "trn1      v20.4s, v4.4s, v6.4s            \n"
+      "trn2      v21.4s, v4.4s, v6.4s            \n"
+      "trn1      v22.4s, v5.4s, v7.4s            \n"
+      "trn2      v23.4s, v5.4s, v7.4s            \n"
 
-    "mov       %0, %2                          \n"
+      "mov       %0, %2                          \n"
 
-    MEMACCESS(0)
-    "st1       {v16.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v17.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v19.d}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v16.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v17.d}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v19.d}[1], [%0]                \n"
+      "st1       {v16.d}[0], [%0], %6            \n"
+      "st1       {v18.d}[0], [%0], %6            \n"
+      "st1       {v17.d}[0], [%0], %6            \n"
+      "st1       {v19.d}[0], [%0], %6            \n"
+      "st1       {v16.d}[1], [%0], %6            \n"
+      "st1       {v18.d}[1], [%0], %6            \n"
+      "st1       {v17.d}[1], [%0], %6            \n"
+      "st1       {v19.d}[1], [%0]                \n"
 
-    "mov       %0, %3                          \n"
+      "mov       %0, %3                          \n"
 
-    MEMACCESS(0)
-    "st1       {v20.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v22.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v21.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v23.d}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v20.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v22.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v21.d}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v23.d}[1], [%0]                \n"
+      "st1       {v20.d}[0], [%0], %7            \n"
+      "st1       {v22.d}[0], [%0], %7            \n"
+      "st1       {v21.d}[0], [%0], %7            \n"
+      "st1       {v23.d}[0], [%0], %7            \n"
+      "st1       {v20.d}[1], [%0], %7            \n"
+      "st1       {v22.d}[1], [%0], %7            \n"
+      "st1       {v21.d}[1], [%0], %7            \n"
+      "st1       {v23.d}[1], [%0]                \n"
 
-    "add       %1, %1, #16                     \n"  // src   += 8*2
-    "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 * dst_stride_a
-    "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 * dst_stride_b
-    "subs      %4, %4,  #8                     \n"  // w     -= 8
-    "b.ge      1b                              \n"
+      "add       %1, %1, #16                     \n"  // src   += 8*2
+      "add       %2, %2, %6, lsl #3              \n"  // dst_a += 8 *
+                                                      // dst_stride_a
+      "add       %3, %3, %7, lsl #3              \n"  // dst_b += 8 *
+                                                      // dst_stride_b
+      "subs      %w4, %w4,  #8                   \n"  // w     -= 8
+      "b.ge      1b                              \n"
 
-    // add 8 back to counter. if the result is 0 there are
-    // no residuals.
-    "adds      %4, %4, #8                      \n"
-    "b.eq      4f                              \n"
+      // add 8 back to counter. if the result is 0 there are
+      // no residuals.
+      "adds      %w4, %w4, #8                    \n"
+      "b.eq      4f                              \n"
 
-    // some residual, so between 1 and 7 lines left to transpose
-    "cmp       %4, #2                          \n"
-    "b.lt      3f                              \n"
+      // some residual, so between 1 and 7 lines left to transpose
+      "cmp       %w4, #2                         \n"
+      "b.lt      3f                              \n"
 
-    "cmp       %4, #4                          \n"
-    "b.lt      2f                              \n"
+      "cmp       %w4, #4                         \n"
+      "b.lt      2f                              \n"
 
-    // TODO(frkoenig): Clean this up
-    // 4x8 block
-    "mov       %0, %1                          \n"
-    MEMACCESS(0)
-    "ld1       {v0.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v1.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v2.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v3.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v4.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v5.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v6.8b}, [%0], %5               \n"
-    MEMACCESS(0)
-    "ld1       {v7.8b}, [%0]                   \n"
+      // TODO(frkoenig): Clean this up
+      // 4x8 block
+      "mov       %0, %1                          \n"
+      "ld1       {v0.8b}, [%0], %5               \n"
+      "ld1       {v1.8b}, [%0], %5               \n"
+      "ld1       {v2.8b}, [%0], %5               \n"
+      "ld1       {v3.8b}, [%0], %5               \n"
+      "ld1       {v4.8b}, [%0], %5               \n"
+      "ld1       {v5.8b}, [%0], %5               \n"
+      "ld1       {v6.8b}, [%0], %5               \n"
+      "ld1       {v7.8b}, [%0]                   \n"
 
-    MEMACCESS(8)
-    "ld1       {v30.16b}, [%8], #16            \n"
-    "ld1       {v31.16b}, [%8]                 \n"
+      "ld1       {v30.16b}, [%8], #16            \n"
+      "ld1       {v31.16b}, [%8]                 \n"
 
-    "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
-    "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
-    "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
-    "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
+      "tbl       v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b  \n"
+      "tbl       v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b  \n"
+      "tbl       v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b  \n"
+      "tbl       v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b  \n"
 
-    "mov       %0, %2                          \n"
+      "mov       %0, %2                          \n"
 
-    MEMACCESS(0)
-    "st1       {v16.s}[0],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[1],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[2],  [%0], %6           \n"
-    MEMACCESS(0)
-    "st1       {v16.s}[3],  [%0], %6           \n"
+      "st1       {v16.s}[0],  [%0], %6           \n"
+      "st1       {v16.s}[1],  [%0], %6           \n"
+      "st1       {v16.s}[2],  [%0], %6           \n"
+      "st1       {v16.s}[3],  [%0], %6           \n"
 
-    "add       %0, %2, #4                      \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[0], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[1], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[2], [%0], %6            \n"
-    MEMACCESS(0)
-    "st1       {v18.s}[3], [%0]                \n"
+      "add       %0, %2, #4                      \n"
+      "st1       {v18.s}[0], [%0], %6            \n"
+      "st1       {v18.s}[1], [%0], %6            \n"
+      "st1       {v18.s}[2], [%0], %6            \n"
+      "st1       {v18.s}[3], [%0]                \n"
 
-    "mov       %0, %3                          \n"
+      "mov       %0, %3                          \n"
 
-    MEMACCESS(0)
-    "st1       {v17.s}[0], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[1], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[2], [%0], %7            \n"
-    MEMACCESS(0)
-    "st1       {v17.s}[3], [%0], %7            \n"
+      "st1       {v17.s}[0], [%0], %7            \n"
+      "st1       {v17.s}[1], [%0], %7            \n"
+      "st1       {v17.s}[2], [%0], %7            \n"
+      "st1       {v17.s}[3], [%0], %7            \n"
 
-    "add       %0, %3, #4                      \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[0],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[1],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[2],  [%0], %7           \n"
-    MEMACCESS(0)
-    "st1       {v19.s}[3],  [%0]               \n"
+      "add       %0, %3, #4                      \n"
+      "st1       {v19.s}[0],  [%0], %7           \n"
+      "st1       {v19.s}[1],  [%0], %7           \n"
+      "st1       {v19.s}[2],  [%0], %7           \n"
+      "st1       {v19.s}[3],  [%0]               \n"
 
-    "add       %1, %1, #8                      \n"  // src   += 4 * 2
-    "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 * dst_stride_a
-    "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 * dst_stride_b
-    "subs      %4,  %4,  #4                    \n"  // w     -= 4
-    "b.eq      4f                              \n"
+      "add       %1, %1, #8                      \n"  // src   += 4 * 2
+      "add       %2, %2, %6, lsl #2              \n"  // dst_a += 4 *
+                                                      // dst_stride_a
+      "add       %3, %3, %7, lsl #2              \n"  // dst_b += 4 *
+                                                      // dst_stride_b
+      "subs      %w4,  %w4,  #4                  \n"  // w     -= 4
+      "b.eq      4f                              \n"
 
-    // some residual, check to see if it includes a 2x8 block,
-    // or less
-    "cmp       %4, #2                          \n"
-    "b.lt      3f                              \n"
+      // some residual, check to see if it includes a 2x8 block,
+      // or less
+      "cmp       %w4, #2                         \n"
+      "b.lt      3f                              \n"
 
-    // 2x8 block
-    "2:                                        \n"
-    "mov       %0, %1                          \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
-    MEMACCESS(0)
-    "ld2       {v2.h, v3.h}[3], [%0]           \n"
+      // 2x8 block
+      "2:                                        \n"
+      "mov       %0, %1                          \n"
+      "ld2       {v0.h, v1.h}[0], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[0], [%0], %5       \n"
+      "ld2       {v0.h, v1.h}[1], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[1], [%0], %5       \n"
+      "ld2       {v0.h, v1.h}[2], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[2], [%0], %5       \n"
+      "ld2       {v0.h, v1.h}[3], [%0], %5       \n"
+      "ld2       {v2.h, v3.h}[3], [%0]           \n"
 
-    "trn1      v4.8b, v0.8b, v2.8b             \n"
-    "trn2      v5.8b, v0.8b, v2.8b             \n"
-    "trn1      v6.8b, v1.8b, v3.8b             \n"
-    "trn2      v7.8b, v1.8b, v3.8b             \n"
+      "trn1      v4.8b, v0.8b, v2.8b             \n"
+      "trn2      v5.8b, v0.8b, v2.8b             \n"
+      "trn1      v6.8b, v1.8b, v3.8b             \n"
+      "trn2      v7.8b, v1.8b, v3.8b             \n"
 
-    "mov       %0, %2                          \n"
+      "mov       %0, %2                          \n"
 
-    MEMACCESS(0)
-    "st1       {v4.d}[0], [%0], %6             \n"
-    MEMACCESS(0)
-    "st1       {v6.d}[0], [%0]                 \n"
+      "st1       {v4.d}[0], [%0], %6             \n"
+      "st1       {v6.d}[0], [%0]                 \n"
 
-    "mov       %0, %3                          \n"
+      "mov       %0, %3                          \n"
 
-    MEMACCESS(0)
-    "st1       {v5.d}[0], [%0], %7             \n"
-    MEMACCESS(0)
-    "st1       {v7.d}[0], [%0]                 \n"
+      "st1       {v5.d}[0], [%0], %7             \n"
+      "st1       {v7.d}[0], [%0]                 \n"
 
-    "add       %1, %1, #4                      \n"  // src   += 2 * 2
-    "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 * dst_stride_a
-    "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 * dst_stride_b
-    "subs      %4,  %4,  #2                    \n"  // w     -= 2
-    "b.eq      4f                              \n"
+      "add       %1, %1, #4                      \n"  // src   += 2 * 2
+      "add       %2, %2, %6, lsl #1              \n"  // dst_a += 2 *
+                                                      // dst_stride_a
+      "add       %3, %3, %7, lsl #1              \n"  // dst_b += 2 *
+                                                      // dst_stride_b
+      "subs      %w4,  %w4,  #2                  \n"  // w     -= 2
+      "b.eq      4f                              \n"
 
-    // 1x8 block
-    "3:                                        \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
-    MEMACCESS(1)
-    "ld2       {v0.b, v1.b}[7], [%1]           \n"
+      // 1x8 block
+      "3:                                        \n"
+      "ld2       {v0.b, v1.b}[0], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[1], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[2], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[3], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[4], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[5], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[6], [%1], %5       \n"
+      "ld2       {v0.b, v1.b}[7], [%1]           \n"
 
-    MEMACCESS(2)
-    "st1       {v0.d}[0], [%2]                 \n"
-    MEMACCESS(3)
-    "st1       {v1.d}[0], [%3]                 \n"
+      "st1       {v0.d}[0], [%2]                 \n"
+      "st1       {v1.d}[0], [%3]                 \n"
 
-    "4:                                        \n"
+      "4:                                        \n"
 
-    : "=&r"(src_temp),                            // %0
-      "+r"(src),                                  // %1
-      "+r"(dst_a),                                // %2
-      "+r"(dst_b),                                // %3
-      "+r"(width64)                               // %4
-    : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
-      "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
-      "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
-      "r"(&kVTbl4x4TransposeDi)                   // %8
-    : "memory", "cc",
-      "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-      "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
-      "v30", "v31"
-  );
+      : "=&r"(src_temp),                            // %0
+        "+r"(src),                                  // %1
+        "+r"(dst_a),                                // %2
+        "+r"(dst_b),                                // %3
+        "+r"(width)                                 // %4
+      : "r"(static_cast<ptrdiff_t>(src_stride)),    // %5
+        "r"(static_cast<ptrdiff_t>(dst_stride_a)),  // %6
+        "r"(static_cast<ptrdiff_t>(dst_stride_b)),  // %7
+        "r"(&kVTbl4x4TransposeDi)                   // %8
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
 }
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
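
The aarch64 TransposeUVWx8 above handles arbitrary widths by peeling the
largest block it can at each step: the "1:" loop consumes 8 interleaved UV
columns per iteration, and the residual then falls through the 4x8, 2x8 and
1x8 blocks.  The scalar model below captures that cascade and the pointer
arithmetic in the commented add/lsl lines; it is a behavioral sketch, not a
line-for-line translation of the assembly.

    #include <stdint.h>

    /* Behavioral model of TransposeUVWx8: transpose w interleaved UV columns
       by 8 rows, de-interleaving U into dst_a and V into dst_b. */
    static void TransposeUVWx8_Model(const uint8_t* src, int src_stride,
                                     uint8_t* dst_a, int dst_stride_a,
                                     uint8_t* dst_b, int dst_stride_b, int w) {
      while (w > 0) {
        int block = (w >= 8) ? 8 : (w >= 4) ? 4 : (w >= 2) ? 2 : 1;
        for (int x = 0; x < block; ++x) {   /* source column = output row */
          for (int y = 0; y < 8; ++y) {     /* the 8 source rows */
            dst_a[x * dst_stride_a + y] = src[y * src_stride + 2 * x + 0];
            dst_b[x * dst_stride_b + y] = src[y * src_stride + 2 * x + 1];
          }
        }
        src += block * 2;                   /* skip block UV byte pairs */
        dst_a += block * dst_stride_a;
        dst_b += block * dst_stride_b;
        w -= block;
      }
    }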
 
--- a/third_party/libyuv/source/rotate_win.cc
+++ b/third_party/libyuv/source/rotate_win.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/row.h"
 #include "libyuv/rotate_row.h"
+#include "libyuv/row.h"
 
 #ifdef __cplusplus
 namespace libyuv {
@@ -17,17 +17,19 @@
 #endif
 
 // This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
-__declspec(naked)
-void TransposeWx8_SSSE3(const uint8* src, int src_stride,
-                        uint8* dst, int dst_stride, int width) {
+__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src,
+                                          int src_stride,
+                                          uint8_t* dst,
+                                          int dst_stride,
+                                          int width) {
   __asm {
     push      edi
     push      esi
     push      ebp
-    mov       eax, [esp + 12 + 4]   // src
-    mov       edi, [esp + 12 + 8]   // src_stride
+    mov       eax, [esp + 12 + 4]  // src
+    mov       edi, [esp + 12 + 8]  // src_stride
     mov       edx, [esp + 12 + 12]  // dst
     mov       esi, [esp + 12 + 16]  // dst_stride
     mov       ecx, [esp + 12 + 20]  // width
@@ -110,18 +112,20 @@
   }
 }
 
-__declspec(naked)
-void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
-                         uint8* dst_a, int dst_stride_a,
-                         uint8* dst_b, int dst_stride_b,
-                         int w) {
+__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src,
+                                           int src_stride,
+                                           uint8_t* dst_a,
+                                           int dst_stride_a,
+                                           uint8_t* dst_b,
+                                           int dst_stride_b,
+                                           int w) {
   __asm {
     push      ebx
     push      esi
     push      edi
     push      ebp
-    mov       eax, [esp + 16 + 4]   // src
-    mov       edi, [esp + 16 + 8]   // src_stride
+    mov       eax, [esp + 16 + 4]  // src
+    mov       edi, [esp + 16 + 8]  // src_stride
     mov       edx, [esp + 16 + 12]  // dst_a
     mov       esi, [esp + 16 + 16]  // dst_stride_a
     mov       ebx, [esp + 16 + 20]  // dst_b
@@ -133,9 +137,9 @@
     mov       ecx, [ecx + 16 + 28]  // w
 
     align      4
- convertloop:
     // Read in the data from the source pointer.
     // First round of bit swap.
+  convertloop:
     movdqu    xmm0, [eax]
     movdqu    xmm1, [eax + edi]
     lea       eax, [eax + 2 * edi]
@@ -162,13 +166,13 @@
     lea       eax, [eax + 2 * edi]
     movdqu    [esp], xmm5  // backup xmm5
     neg       edi
-    movdqa    xmm5, xmm6   // use xmm5 as temp register.
+    movdqa    xmm5, xmm6  // use xmm5 as temp register.
     punpcklbw xmm6, xmm7
     punpckhbw xmm5, xmm7
     movdqa    xmm7, xmm5
     lea       eax, [eax + 8 * edi + 16]
     neg       edi
-    // Second round of bit swap.
+        // Second round of bit swap.
     movdqa    xmm5, xmm0
     punpcklwd xmm0, xmm2
     punpckhwd xmm5, xmm2
@@ -183,12 +187,13 @@
     movdqa    xmm6, xmm5
     movdqu    xmm5, [esp]  // restore xmm5
     movdqu    [esp], xmm6  // backup xmm6
-    movdqa    xmm6, xmm5    // use xmm6 as temp register.
+    movdqa    xmm6, xmm5  // use xmm6 as temp register.
     punpcklwd xmm5, xmm7
     punpckhwd xmm6, xmm7
     movdqa    xmm7, xmm6
-    // Third round of bit swap.
-    // Write to the destination pointer.
+
+        // Third round of bit swap.
+        // Write to the destination pointer.
     movdqa    xmm6, xmm0
     punpckldq xmm0, xmm4
     punpckhdq xmm6, xmm4
@@ -200,7 +205,7 @@
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm4
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
     punpckldq xmm2, xmm6
     movlpd    qword ptr [edx], xmm2
     movhpd    qword ptr [ebx], xmm2
@@ -209,7 +214,7 @@
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm0
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
     punpckldq xmm1, xmm5
     movlpd    qword ptr [edx], xmm1
     movhpd    qword ptr [ebx], xmm1
@@ -218,7 +223,7 @@
     lea       edx, [edx + 2 * esi]
     movhpd    qword ptr [ebx + ebp], xmm0
     lea       ebx, [ebx + 2 * ebp]
-    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
+    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
     punpckldq xmm3, xmm7
     movlpd    qword ptr [edx], xmm3
     movhpd    qword ptr [ebx], xmm3
--- a/third_party/libyuv/source/row_any.cc
+++ b/third_party/libyuv/source/row_any.cc
@@ -19,30 +19,38 @@
 extern "C" {
 #endif
 
+// The memset on temp is meant to clear the source buffer (not dest) so that
+// SIMD code reading a full multiple of 16 bytes will not trigger msan errors.
+// memset is not needed for production, as the garbage values are processed
+// but not used, although there may be edge cases for subsampling.
+// The size of the buffer is based on the largest read, which can be inferred
+// from the source type (e.g. ARGB) and the mask (last parameter), or by
+// examining how far the source pointers are advanced in the source code.
+
 // Subsampled source needs to be increased by 1 if not even.
 #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
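
SS rounds a subsampled width up, so SS(13, 1) yields 7 chroma samples for 13
luma pixels.  Every ANY* wrapper in this file then follows one pattern: run
the SIMD kernel over the largest multiple of its vector width, stage the
residual in a zero-padded aligned buffer, run one more full-width pass there,
and copy back only the valid output.  A hand-expanded model for a 1-plane,
1-byte-per-pixel kernel follows; the Foo names are illustrative, not libyuv
API.

    #include <stdint.h>
    #include <string.h>

    /* Stand-in for a SIMD kernel that requires width % 16 == 0; the identity
       transform keeps the model runnable. */
    static void Foo16_SIMD(const uint8_t* src, uint8_t* dst, int width) {
      memcpy(dst, src, width);
    }

    void FooRow_Any_Model(const uint8_t* src_ptr, uint8_t* dst_ptr,
                          int width) {
      uint8_t temp[128 * 2];
      memset(temp, 0, 128);                /* clear the source half for msan */
      int r = width & 15;                  /* residual pixels */
      int n = width & ~15;                 /* largest multiple of 16 */
      if (n > 0) {
        Foo16_SIMD(src_ptr, dst_ptr, n);   /* full-speed body */
      }
      memcpy(temp, src_ptr + n, r);        /* stage the tail */
      Foo16_SIMD(temp, temp + 128, 16);    /* one full vector over the tail */
      memcpy(dst_ptr + n, temp + 128, r);  /* keep only r output pixels */
    }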
 
 // Any 4 planes to 1 with yuvconstants
-#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 const uint8* a_buf, uint8* dst_ptr,                           \
-                 const struct YuvConstants* yuvconstants,  int width) {        \
-      SIMD_ALIGNED(uint8 temp[64 * 5]);                                        \
-      memset(temp, 0, 64 * 4);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);        \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      memcpy(temp + 192, a_buf + n, r);                                        \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,            \
-               yuvconstants, MASK + 1);                                        \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
+#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)              \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,                   \
+               const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \
+               const struct YuvConstants* yuvconstants, int width) {         \
+    SIMD_ALIGNED(uint8_t temp[64 * 5]);                                      \
+    memset(temp, 0, 64 * 4); /* for msan */                                  \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n);        \
+    }                                                                        \
+    memcpy(temp, y_buf + n, r);                                              \
+    memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
+    memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
+    memcpy(temp + 192, a_buf + n, r);                                        \
+    ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256,            \
+             yuvconstants, MASK + 1);                                        \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256,                      \
+           SS(r, DUVSHIFT) * BPP);                                           \
+  }
 
 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
 ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7)
@@ -53,36 +61,57 @@
 #ifdef HAS_I422ALPHATOARGBROW_NEON
 ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7)
 #endif
+#ifdef HAS_I422ALPHATOARGBROW_MSA
+ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7)
+#endif
 #undef ANY41C
 
 // Any 3 planes to 1.
-#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                 \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 uint8* dst_ptr, int width) {                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
-      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                             \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);             \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)      \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,          \
+               const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \
+    SIMD_ALIGNED(uint8_t temp[64 * 4]);                             \
+    memset(temp, 0, 64 * 3); /* for YUY2 and msan */                \
+    int r = width & MASK;                                           \
+    int n = width & ~MASK;                                          \
+    if (n > 0) {                                                    \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n);                    \
+    }                                                               \
+    memcpy(temp, y_buf + n, r);                                     \
+    memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));      \
+    memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));     \
+    ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1);    \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,             \
+           SS(r, DUVSHIFT) * BPP);                                  \
+  }
+
+// Merge functions.
+#ifdef HAS_MERGERGBROW_SSSE3
+ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15)
+#endif
+#ifdef HAS_MERGERGBROW_NEON
+ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15)
+#endif
 #ifdef HAS_I422TOYUY2ROW_SSE2
 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15)
 ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOYUY2ROW_AVX2
+ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31)
+ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31)
+#endif
 #ifdef HAS_I422TOYUY2ROW_NEON
 ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOYUY2ROW_MSA
+ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
+#endif
 #ifdef HAS_I422TOUYVYROW_NEON
 ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOUYVYROW_MSA
+ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
+#endif
 #ifdef HAS_BLENDPLANEROW_AVX2
 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
 #endif
@@ -94,36 +123,39 @@
 // Note that odd width replication also applies to 444 because the arm
 // implementation subsamples 444 to 422 internally.
 // Any 3 planes to 1 with yuvconstants
-#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)                \
-    void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf,   \
-                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
-                 int width) {                                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 4]);                                        \
-      memset(temp, 0, 64 * 3);  /* for YUY2 and msan */                        \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);               \
-      }                                                                        \
-      memcpy(temp, y_buf + n, r);                                              \
-      memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));               \
-      memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));              \
-      if (width & 1) {                                                         \
-        temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1];             \
-        temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1];           \
-      }                                                                        \
-      ANY_SIMD(temp, temp + 64, temp + 128, temp + 192,                        \
-               yuvconstants, MASK + 1);                                        \
-      memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192,                      \
-             SS(r, DUVSHIFT) * BPP);                                           \
-    }
+#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK)      \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf,           \
+               const uint8_t* v_buf, uint8_t* dst_ptr,               \
+               const struct YuvConstants* yuvconstants, int width) { \
+    SIMD_ALIGNED(uint8_t temp[128 * 4]);                             \
+    memset(temp, 0, 128 * 3); /* for YUY2 and msan */                \
+    int r = width & MASK;                                            \
+    int n = width & ~MASK;                                           \
+    if (n > 0) {                                                     \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);       \
+    }                                                                \
+    memcpy(temp, y_buf + n, r);                                      \
+    memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT));      \
+    memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT));      \
+    if (width & 1) {                                                 \
+      temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1];   \
+      temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1];   \
+    }                                                                \
+    ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \
+             MASK + 1);                                              \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384,              \
+           SS(r, DUVSHIFT) * BPP);                                   \
+  }
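
A worked example makes the odd-width replication concrete.  For the
I444ToARGBRow instantiations below (UVSHIFT = 0, MASK = 7) with width = 13:

    /* n = 13 & ~7 = 8 pixels go through the SIMD kernel directly.
       r = 13 & 7  = 5 residual bytes per plane are copied into temp,
       temp + 128 and temp + 256.
       width is odd, so temp[128 + 5] = temp[128 + 4] (likewise at 256):
       the last chroma sample is replicated, and an arm kernel that
       internally subsamples 444 to 422 averages a real value with itself
       rather than with a zeroed byte. */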
 
 #ifdef HAS_I422TOARGBROW_SSSE3
 ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
 #endif
-#ifdef HAS_I411TOARGBROW_SSSE3
-ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7)
+#ifdef HAS_I422TOAR30ROW_SSSE3
+ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
 #endif
+#ifdef HAS_I422TOAR30ROW_AVX2
+ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15)
+#endif
 #ifdef HAS_I444TOARGBROW_SSSE3
 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
 ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
@@ -130,10 +162,10 @@
 ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
 ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
 ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
-ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
+ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
 #endif  // HAS_I444TOARGBROW_SSSE3
 #ifdef HAS_I422TORGB24ROW_AVX2
-ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
+ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
 #endif
 #ifdef HAS_I422TOARGBROW_AVX2
 ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
@@ -144,22 +176,18 @@
 #ifdef HAS_I444TOARGBROW_AVX2
 ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15)
 #endif
-#ifdef HAS_I411TOARGBROW_AVX2
-ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15)
-#endif
 #ifdef HAS_I422TOARGB4444ROW_AVX2
-ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7)
+ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15)
 #endif
 #ifdef HAS_I422TOARGB1555ROW_AVX2
-ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15)
 #endif
 #ifdef HAS_I422TORGB565ROW_AVX2
-ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15)
 #endif
 #ifdef HAS_I422TOARGBROW_NEON
 ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7)
 ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7)
-ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7)
 ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7)
 ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7)
 ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7)
@@ -166,25 +194,69 @@
 ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7)
 ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7)
 #endif
+#ifdef HAS_I422TOARGBROW_MSA
+ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7)
+ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7)
+ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15)
+ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7)
+ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
+#endif
 #undef ANY31C
 
+// Any 3 planes of 16 bit to 1 with yuvconstants
+// TODO(fbarchard): consider sharing this code with ANY31C
+#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
+  void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf,            \
+               uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \
+               int width) {                                               \
+    SIMD_ALIGNED(T temp[16 * 3]);                                         \
+    SIMD_ALIGNED(uint8_t out[64]);                                        \
+    memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */               \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n);            \
+    }                                                                     \
+    memcpy(temp, y_buf + n, r * SBPP);                                    \
+    memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP);     \
+    memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP);     \
+    ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1);    \
+    memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP);  \
+  }
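
The separate out[] buffer is what lets this variant mix element sizes: temp
holds 16-bit samples while the kernel's output is byte-addressed.  A worked
trace for the I210ToAR30 instantiation below (UVSHIFT = 1, T = uint16_t,
SBPP = 2, BPP = 4, MASK = 7) with width = 11:

    /* n = 11 & ~7 = 8 pixels are handled by the SIMD kernel directly.
       r = 11 & 7  = 3: memcpy stages r * SBPP = 6 luma bytes and
       SS(3, 1) * SBPP = 4 chroma bytes per plane in temp.
       The kernel writes MASK + 1 = 8 AR30 pixels (32 bytes) into out[64],
       of which SS(3, 0) * BPP = 12 bytes reach dst_ptr. */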
+
+#ifdef HAS_I210TOAR30ROW_SSSE3
+ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_SSSE3
+ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7)
+#endif
+#ifdef HAS_I210TOARGBROW_AVX2
+ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#ifdef HAS_I210TOAR30ROW_AVX2
+ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15)
+#endif
+#undef ANY31CT
+
 // Any 2 planes to 1.
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)              \
-    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
-                 uint8* dst_ptr, int width) {                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                                   \
-      }                                                                        \
-      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
-      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
-             SS(r, UVSHIFT) * SBPP2);                                          \
-      ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                         \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)             \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+               int width) {                                                   \
+    SIMD_ALIGNED(uint8_t temp[64 * 3]);                                       \
+    memset(temp, 0, 64 * 2); /* for msan */                                   \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(y_buf, uv_buf, dst_ptr, n);                                    \
+    }                                                                         \
+    memcpy(temp, y_buf + n * SBPP, r * SBPP);                                 \
+    memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                        \
+           SS(r, UVSHIFT) * SBPP2);                                           \
+    ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1);                          \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                           \
+  }
 
 // Merge functions.
 #ifdef HAS_MERGEUVROW_SSE2
@@ -196,6 +268,9 @@
 #ifdef HAS_MERGEUVROW_NEON
 ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
 #endif
+#ifdef HAS_MERGEUVROW_MSA
+ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
+#endif
 
 // Math functions.
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
@@ -225,6 +300,15 @@
 #ifdef HAS_ARGBSUBTRACTROW_NEON
 ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7)
 #endif
+#ifdef HAS_ARGBMULTIPLYROW_MSA
+ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3)
+#endif
+#ifdef HAS_ARGBADDROW_MSA
+ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7)
+#endif
+#ifdef HAS_ARGBSUBTRACTROW_MSA
+ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7)
+#endif
 #ifdef HAS_SOBELROW_SSE2
 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15)
 #endif
@@ -231,6 +315,9 @@
 #ifdef HAS_SOBELROW_NEON
 ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7)
 #endif
+#ifdef HAS_SOBELROW_MSA
+ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15)
+#endif
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15)
 #endif
@@ -237,6 +324,9 @@
 #ifdef HAS_SOBELTOPLANEROW_NEON
 ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15)
 #endif
+#ifdef HAS_SOBELTOPLANEROW_MSA
+ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31)
+#endif
 #ifdef HAS_SOBELXYROW_SSE2
 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15)
 #endif
@@ -243,26 +333,28 @@
 #ifdef HAS_SOBELXYROW_NEON
 ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7)
 #endif
+#ifdef HAS_SOBELXYROW_MSA
+ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15)
+#endif
 #undef ANY21
 
 // Any 2 planes to 1 with yuvconstants
-#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)             \
-    void NAMEANY(const uint8* y_buf, const uint8* uv_buf,                      \
-                 uint8* dst_ptr, const struct YuvConstants* yuvconstants,      \
-                 int width) {                                                  \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                     \
-      }                                                                        \
-      memcpy(temp, y_buf + n * SBPP, r * SBPP);                                \
-      memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
-             SS(r, UVSHIFT) * SBPP2);                                          \
-      ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1);           \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)            \
+  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
+               const struct YuvConstants* yuvconstants, int width) {          \
+    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                      \
+    memset(temp, 0, 128 * 2); /* for msan */                                  \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n);                      \
+    }                                                                         \
+    memcpy(temp, y_buf + n * SBPP, r * SBPP);                                 \
+    memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2,                       \
+           SS(r, UVSHIFT) * SBPP2);                                           \
+    ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1);           \
+    memcpy(dst_ptr + n * BPP, temp + 256, r * BPP);                           \
+  }
 
 // Biplanar to RGB.
 #ifdef HAS_NV12TOARGBROW_SSSE3
@@ -274,6 +366,9 @@
 #ifdef HAS_NV12TOARGBROW_NEON
 ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7)
 #endif
+#ifdef HAS_NV12TOARGBROW_MSA
+ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
 #ifdef HAS_NV21TOARGBROW_SSSE3
 ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7)
 #endif
@@ -283,6 +378,27 @@
 #ifdef HAS_NV21TOARGBROW_NEON
 ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7)
 #endif
+#ifdef HAS_NV21TOARGBROW_MSA
+ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7)
+#endif
+#ifdef HAS_NV12TORGB24ROW_NEON
+ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV21TORGB24ROW_NEON
+ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7)
+#endif
+#ifdef HAS_NV12TORGB24ROW_SSSE3
+ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TORGB24ROW_SSSE3
+ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV12TORGB24ROW_AVX2
+ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
+#ifdef HAS_NV21TORGB24ROW_AVX2
+ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
 #ifdef HAS_NV12TORGB565ROW_SSSE3
 ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7)
 #endif
@@ -292,22 +408,25 @@
 #ifdef HAS_NV12TORGB565ROW_NEON
 ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7)
 #endif
+#ifdef HAS_NV12TORGB565ROW_MSA
+ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7)
+#endif
 #undef ANY21C
 
 // Any 1 to 1.
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                     \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {     \
+    SIMD_ALIGNED(uint8_t temp[128 * 2]);                                  \
+    memset(temp, 0, 128); /* for YUY2 and msan */                         \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, n);                                      \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    ANY_SIMD(temp, temp + 128, MASK + 1);                                 \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
+  }
 
 #ifdef HAS_COPYROW_AVX
 ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63)
@@ -325,6 +444,15 @@
 ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
 ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
 #endif
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI)
+ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31)
+#endif
+#if defined(HAS_ARGBTORAWROW_AVX2)
+ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
+#endif
 #if defined(HAS_ARGBTORGB565ROW_AVX2)
 ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
 #endif
@@ -332,6 +460,18 @@
 ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
 ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
 #endif
+#if defined(HAS_ABGRTOAR30ROW_SSSE3)
+ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
+#endif
+#if defined(HAS_ABGRTOAR30ROW_AVX2)
+ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
+#if defined(HAS_ARGBTOAR30ROW_AVX2)
+ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
+#endif
 #if defined(HAS_J400TOARGBROW_SSE2)
 ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7)
 #endif
@@ -372,9 +512,21 @@
 ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7)
 ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7)
 #endif
+#if defined(HAS_ARGBTORGB24ROW_MSA)
+ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15)
+ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7)
+ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7)
+ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15)
+ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15)
+#endif
 #if defined(HAS_RAWTORGB24ROW_NEON)
 ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7)
 #endif
+#if defined(HAS_RAWTORGB24ROW_MSA)
+ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15)
+#endif
 #ifdef HAS_ARGBTOYROW_AVX2
 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31)
 #endif
@@ -403,30 +555,57 @@
 #ifdef HAS_ARGBTOYROW_NEON
 ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ARGBTOYROW_MSA
+ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_ARGBTOYJROW_NEON
 ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ARGBTOYJROW_MSA
+ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_BGRATOYROW_NEON
 ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_BGRATOYROW_MSA
+ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_ABGRTOYROW_NEON
 ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ABGRTOYROW_MSA
+ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
+#endif
 #ifdef HAS_RGBATOYROW_NEON
 ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
 #endif
+#ifdef HAS_RGBATOYROW_MSA
+ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
+#endif
 #ifdef HAS_RGB24TOYROW_NEON
 ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
 #endif
+#ifdef HAS_RGB24TOYROW_MSA
+ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
+#endif
 #ifdef HAS_RAWTOYROW_NEON
 ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
 #endif
+#ifdef HAS_RAWTOYROW_MSA
+ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
+#endif
 #ifdef HAS_RGB565TOYROW_NEON
 ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7)
 #endif
+#ifdef HAS_RGB565TOYROW_MSA
+ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15)
+#endif
 #ifdef HAS_ARGB1555TOYROW_NEON
 ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7)
 #endif
+#ifdef HAS_ARGB1555TOYROW_MSA
+ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15)
+#endif
 #ifdef HAS_ARGB4444TOYROW_NEON
 ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7)
 #endif
@@ -434,23 +613,44 @@
 ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
 #endif
 #ifdef HAS_UYVYTOYROW_NEON
-ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
+ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15)
 #endif
+#ifdef HAS_YUY2TOYROW_MSA
+ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_MSA
+ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
+#endif
 #ifdef HAS_RGB24TOARGBROW_NEON
 ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
 #endif
+#ifdef HAS_RGB24TOARGBROW_MSA
+ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15)
+#endif
 #ifdef HAS_RAWTOARGBROW_NEON
 ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7)
 #endif
+#ifdef HAS_RAWTOARGBROW_MSA
+ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15)
+#endif
 #ifdef HAS_RGB565TOARGBROW_NEON
 ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_RGB565TOARGBROW_MSA
+ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
 #ifdef HAS_ARGB1555TOARGBROW_NEON
 ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_ARGB1555TOARGBROW_MSA
+ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
 #ifdef HAS_ARGB4444TOARGBROW_NEON
 ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7)
 #endif
+#ifdef HAS_ARGB4444TOARGBROW_MSA
+ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15)
+#endif
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3)
 #endif
@@ -466,29 +666,38 @@
 #ifdef HAS_ARGBATTENUATEROW_NEON
 ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7)
 #endif
+#ifdef HAS_ARGBATTENUATEROW_MSA
+ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7)
+#endif
 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7)
 #endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31)
+#endif
 #ifdef HAS_ARGBEXTRACTALPHAROW_NEON
 ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15)
 #endif
+#ifdef HAS_ARGBEXTRACTALPHAROW_MSA
+ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15)
+#endif
 #undef ANY11
 
 // Any 1 to 1 blended.  Destination is read, modify, write.
-#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128 * 2);  /* for YUY2 and msan */                       \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, n);                                         \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      memcpy(temp + 128, dst_ptr + n * BPP, r * BPP);                          \
-      ANY_SIMD(temp, temp + 128, MASK + 1);                                    \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)               \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {     \
+    SIMD_ALIGNED(uint8_t temp[64 * 2]);                                   \
+    memset(temp, 0, 64 * 2); /* for msan */                               \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, n);                                      \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    memcpy(temp + 64, dst_ptr + n * BPP, r * BPP);                        \
+    ANY_SIMD(temp, temp + 64, MASK + 1);                                  \
+    memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                        \
+  }
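
The step that distinguishes this blended wrapper from plain ANY11 is the
second memcpy: the destination tail is staged next to the source tail, so the
kernel's read-modify-write sees the caller's current pixels rather than
zeros.  A trace for the ARGBCopyAlphaRow instantiation below (SBPP = BPP = 4,
MASK = 15) with width = 21:

    /* n = 21 & ~15 = 16, r = 5.
       memcpy(temp,      src_ptr + 16 * 4, 5 * 4);  stage source tail
       memcpy(temp + 64, dst_ptr + 16 * 4, 5 * 4);  stage current dst pixels
       ANY_SIMD(temp, temp + 64, 16);               blend into temp + 64
       memcpy(dst_ptr + 16 * 4, temp + 64, 5 * 4);  write back 5 pixels */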
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15)
@@ -506,61 +715,184 @@
 
 // Any 1 to 1 with parameter.
 #define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                          \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
-                 T shuffler, int width) {                                      \
-      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
-      memset(temp, 0, 64);  /* for msan */                                     \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                               \
-      }                                                                        \
-      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
-      ANY_SIMD(temp, temp + 64, shuffler, MASK + 1);                           \
-      memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                           \
-    }
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \
+    SIMD_ALIGNED(uint8_t temp[64 * 2]);                                        \
+    memset(temp, 0, 64); /* for msan */                                        \
+    int r = width & MASK;                                                      \
+    int n = width & ~MASK;                                                     \
+    if (n > 0) {                                                               \
+      ANY_SIMD(src_ptr, dst_ptr, param, n);                                    \
+    }                                                                          \
+    memcpy(temp, src_ptr + n * SBPP, r * SBPP);                                \
+    ANY_SIMD(temp, temp + 64, param, MASK + 1);                                \
+    memcpy(dst_ptr + n * BPP, temp + 64, r * BPP);                             \
+  }
 
 #if defined(HAS_ARGBTORGB565DITHERROW_SSE2)
-ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2,
-       const uint32, 4, 2, 3)
+ANY11P(ARGBToRGB565DitherRow_Any_SSE2,
+       ARGBToRGB565DitherRow_SSE2,
+       const uint32_t,
+       4,
+       2,
+       3)
 #endif
 #if defined(HAS_ARGBTORGB565DITHERROW_AVX2)
-ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2,
-       const uint32, 4, 2, 7)
+ANY11P(ARGBToRGB565DitherRow_Any_AVX2,
+       ARGBToRGB565DitherRow_AVX2,
+       const uint32_t,
+       4,
+       2,
+       7)
 #endif
 #if defined(HAS_ARGBTORGB565DITHERROW_NEON)
-ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
-       const uint32, 4, 2, 7)
+ANY11P(ARGBToRGB565DitherRow_Any_NEON,
+       ARGBToRGB565DitherRow_NEON,
+       const uint32_t,
+       4,
+       2,
+       7)
 #endif
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3)
+#if defined(HAS_ARGBTORGB565DITHERROW_MSA)
+ANY11P(ARGBToRGB565DitherRow_Any_MSA,
+       ARGBToRGB565DitherRow_MSA,
+       const uint32_t,
+       4,
+       2,
+       7)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
-ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7)
+ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15)
+ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15)
 #endif
 #ifdef HAS_ARGBSHUFFLEROW_NEON
-ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
+ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3)
 #endif
+#ifdef HAS_ARGBSHUFFLEROW_MSA
+ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7)
+#endif
 #undef ANY11P
 
+// Any 1 to 1 with an int scale parameter.  SBPP and BPP are bytes per pixel.
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK)             \
+  void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
+    SIMD_ALIGNED(STYPE temp[32]);                                            \
+    SIMD_ALIGNED(DTYPE out[32]);                                             \
+    memset(temp, 0, 32 * SBPP); /* for msan */                               \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, dst_ptr, scale, n);                                  \
+    }                                                                        \
+    memcpy(temp, src_ptr + n, r * SBPP);                                     \
+    ANY_SIMD(temp, out, scale, MASK + 1);                                    \
+    memcpy(dst_ptr + n, out, r * BPP);                                       \
+  }
+
+#ifdef HAS_CONVERT16TO8ROW_SSSE3
+ANY11C(Convert16To8Row_Any_SSSE3,
+       Convert16To8Row_SSSE3,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       15)
+#endif
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+ANY11C(Convert16To8Row_Any_AVX2,
+       Convert16To8Row_AVX2,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       31)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_SSE2
+ANY11C(Convert8To16Row_Any_SSE2,
+       Convert8To16Row_SSE2,
+       1,
+       2,
+       uint8_t,
+       uint16_t,
+       15)
+#endif
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+ANY11C(Convert8To16Row_Any_AVX2,
+       Convert8To16Row_AVX2,
+       1,
+       2,
+       uint8_t,
+       uint16_t,
+       31)
+#endif
+#undef ANY11C
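
For context on the scale parameter these Convert16To8/Convert8To16 wrappers forward: in the C reference (outside this hunk) it appears to behave as 16.16 fixed point, so 16384 maps 10-bit samples down to 8 bits. A hedged scalar sketch of that contract:

#include <stdint.h>

static uint8_t Convert16To8(uint16_t v, int scale) {
  int x = (v * scale) >> 16;      /* scale assumed to be 16.16 fixed point */
  return (uint8_t)(x > 255 ? 255 : x);
}
/* Convert16To8(1023, 16384) == 255; Convert16To8(512, 16384) == 128. */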
+
+// Any 1 to 1 with a float parameter (half-float and byte-to-float rows).
+#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK)             \
+  void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
+    SIMD_ALIGNED(ST temp[32]);                                          \
+    SIMD_ALIGNED(T out[32]);                                            \
+    memset(temp, 0, SBPP * 32); /* for msan */                          \
+    int r = width & MASK;                                               \
+    int n = width & ~MASK;                                              \
+    if (n > 0) {                                                        \
+      ANY_SIMD(src_ptr, dst_ptr, param, n);                             \
+    }                                                                   \
+    memcpy(temp, src_ptr + n, r * SBPP);                                \
+    ANY_SIMD(temp, out, param, MASK + 1);                               \
+    memcpy(dst_ptr + n, out, r * BPP);                                  \
+  }
+
+#ifdef HAS_HALFFLOATROW_SSE2
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
+#endif
+#ifdef HAS_HALFFLOATROW_AVX2
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
+#endif
+#ifdef HAS_HALFFLOATROW_F16C
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
+ANY11P16(HalfFloat1Row_Any_F16C,
+         HalfFloat1Row_F16C,
+         uint16_t,
+         uint16_t,
+         2,
+         2,
+         15)
+#endif
+#ifdef HAS_HALFFLOATROW_NEON
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
+ANY11P16(HalfFloat1Row_Any_NEON,
+         HalfFloat1Row_NEON,
+         uint16_t,
+         uint16_t,
+         2,
+         2,
+         7)
+#endif
+#ifdef HAS_HALFFLOATROW_MSA
+ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
+#endif
+#ifdef HAS_BYTETOFLOATROW_NEON
+ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7)
+#endif
+#undef ANY11P16
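
A scalar sketch of the simplest of these parameterized rows, ByteToFloatRow, as the wrapper above uses it: each byte is multiplied into a float (a scale of 1/255.f would normalize to 0..1). The half-float rows follow the same shape, with the float scale applied before the 16-bit conversion. This is an illustration of the contract, not the reference implementation:

#include <stdint.h>

static void ByteToFloatRow(const uint8_t* src, float* dst, float scale,
                           int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[x] * scale;
  }
}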
+
 // Any 1 to 1 with yuvconstants
-#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)                    \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr,                         \
-                 const struct YuvConstants* yuvconstants, int width) {         \
-      SIMD_ALIGNED(uint8 temp[128 * 2]);                                       \
-      memset(temp, 0, 128);  /* for YUY2 and msan */                           \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n);                           \
-      }                                                                        \
-      memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP);    \
-      ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1);                      \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK)               \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr,                  \
+               const struct YuvConstants* yuvconstants, int width) {      \
+    SIMD_ALIGNED(uint8_t temp[128 * 2]);                                  \
+    memset(temp, 0, 128); /* for YUY2 and msan */                         \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n);                        \
+    }                                                                     \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \
+    ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1);                   \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                       \
+  }
 #if defined(HAS_YUY2TOARGBROW_SSSE3)
 ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15)
 ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15)
@@ -573,25 +905,28 @@
 ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7)
 ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7)
 #endif
+#if defined(HAS_YUY2TOARGBROW_MSA)
+ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7)
+ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7)
+#endif
 #undef ANY11C
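
The SS() helper used by the subsampling wrappers here is defined near the top of row_any.cc (outside this hunk); as recalled, it is essentially a round-up shift, so an odd width still yields the final chroma sample:

/* SS(5, 1) == 3 and SS(4, 1) == 2: round up when halving a width. */
#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))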
 
 // Any 1 to 1 interpolate.  Takes 2 rows of source via stride.
-#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                             \
-    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
-                 ptrdiff_t src_stride_ptr, int width,                          \
-                 int source_y_fraction) {                                      \
-      SIMD_ALIGNED(uint8 temp[64 * 3]);                                        \
-      memset(temp, 0, 64 * 2);  /* for msan */                                 \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);      \
-      }                                                                        \
-      memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
-      memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);        \
-      ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);             \
-      memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
-    }
+#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                           \
+  void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr,                     \
+               ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \
+    SIMD_ALIGNED(uint8_t temp[64 * 3]);                                      \
+    memset(temp, 0, 64 * 2); /* for msan */                                  \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction);      \
+    }                                                                        \
+    memcpy(temp, src_ptr + n * SBPP, r * SBPP);                              \
+    memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP);        \
+    ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction);             \
+    memcpy(dst_ptr + n * BPP, temp + 128, r * BPP);                          \
+  }
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31)
@@ -602,25 +937,25 @@
 #ifdef HAS_INTERPOLATEROW_NEON
 ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15)
 #endif
-#ifdef HAS_INTERPOLATEROW_DSPR2
-ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3)
+#ifdef HAS_INTERPOLATEROW_MSA
+ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31)
 #endif
 #undef ANY11T
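
The interpolate kernels blend two rows with an 8-bit fraction: source_y_fraction weights the second row (src_ptr + stride) and 256 - fraction weights the first. A scalar sketch of the core loop (the C reference also special-cases fractions 0 and 128, and the exact rounding term is an assumption here):

#include <stddef.h>
#include <stdint.h>

static void InterpolateRow(uint8_t* dst, const uint8_t* src, ptrdiff_t stride,
                           int width, int source_y_fraction) {
  int f1 = source_y_fraction;     /* 0..255 */
  int f0 = 256 - f1;
  const uint8_t* src1 = src + stride;
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((src[x] * f0 + src1[x] * f1 + 128) >> 8);
  }
}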
 
 // Any 1 to 1 mirror.
-#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                                   \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) {            \
-      SIMD_ALIGNED(uint8 temp[64 * 2]);                                        \
-      memset(temp, 0, 64);  /* for msan */                                     \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                               \
-      }                                                                        \
-      memcpy(temp, src_ptr, r * BPP);                                          \
-      ANY_SIMD(temp, temp + 64, MASK + 1);                                     \
-      memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP);    \
-    }
+#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK)                              \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) {     \
+    SIMD_ALIGNED(uint8_t temp[64 * 2]);                                   \
+    memset(temp, 0, 64); /* for msan */                                   \
+    int r = width & MASK;                                                 \
+    int n = width & ~MASK;                                                \
+    if (n > 0) {                                                          \
+      ANY_SIMD(src_ptr + r * BPP, dst_ptr, n);                            \
+    }                                                                     \
+    memcpy(temp, src_ptr, r * BPP);                                       \
+    ANY_SIMD(temp, temp + 64, MASK + 1);                                  \
+    memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \
+  }
 
 #ifdef HAS_MIRRORROW_AVX2
 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31)
@@ -631,6 +966,9 @@
 #ifdef HAS_MIRRORROW_NEON
 ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15)
 #endif
+#ifdef HAS_MIRRORROW_MSA
+ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
+#endif
 #ifdef HAS_ARGBMIRRORROW_AVX2
 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
 #endif
@@ -640,67 +978,54 @@
 #ifdef HAS_ARGBMIRRORROW_NEON
 ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3)
 #endif
+#ifdef HAS_ARGBMIRRORROW_MSA
+ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)
+#endif
 #undef ANY11M
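
ANY11M inverts the usual remainder handling because the kernel mirrors: the r leftover source pixels are the first r of the row and land at the end of the destination, hence the main kernel call starting at src_ptr + r * BPP and the tail copy skipping (MASK + 1 - r) mirrored pixels. A scalar sketch of what the mirrored kernels compute:

#include <stdint.h>

static void MirrorRow(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}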
 
 // Any 1 plane. (memset)
-#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)                                  \
-    void NAMEANY(uint8* dst_ptr, T v32, int width) {                           \
-      SIMD_ALIGNED(uint8 temp[64]);                                            \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(dst_ptr, v32, n);                                             \
-      }                                                                        \
-      ANY_SIMD(temp, v32, MASK + 1);                                           \
-      memcpy(dst_ptr + n * BPP, temp, r * BPP);                                \
-    }
+#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK)        \
+  void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \
+    SIMD_ALIGNED(uint8_t temp[64]);                  \
+    int r = width & MASK;                            \
+    int n = width & ~MASK;                           \
+    if (n > 0) {                                     \
+      ANY_SIMD(dst_ptr, v32, n);                     \
+    }                                                \
+    ANY_SIMD(temp, v32, MASK + 1);                   \
+    memcpy(dst_ptr + n * BPP, temp, r * BPP);        \
+  }
 
 #ifdef HAS_SETROW_X86
-ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3)
+ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3)
 #endif
 #ifdef HAS_SETROW_NEON
-ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15)
+ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15)
 #endif
 #ifdef HAS_ARGBSETROW_NEON
-ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3)
+ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3)
 #endif
+#ifdef HAS_ARGBSETROW_MSA
+ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3)
+#endif
 #undef ANY1
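
ANY1 is the degenerate case with no source plane: the kernel simply runs once more into the scratch buffer and r bytes are copied out. For reference, the scalar SetRow is believed to be little more than memset, with ARGBSetRow storing a 32-bit pattern per pixel:

#include <stdint.h>
#include <string.h>

static void SetRow(uint8_t* dst, uint8_t v8, int width) {
  memset(dst, v8, width);
}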
 
 // Any 1 to 2.  Outputs UV planes.
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)                 \
-    void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\
-      SIMD_ALIGNED(uint8 temp[128 * 3]);                                       \
-      memset(temp, 0, 128);  /* for msan */                                    \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, dst_u, dst_v, n);                                    \
-      }                                                                        \
-      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
-      /* repeat last 4 bytes for 422 subsampler */                             \
-      if ((width & 1) && BPP == 4 && DUVSHIFT == 1) {                          \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-      }                                                                        \
-      /* repeat last 4 - 12 bytes for 411 subsampler */                        \
-      if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-        memcpy(temp + SS(r, UVSHIFT) * BPP + BPP,                              \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2);                    \
-      }                                                                        \
-      if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2);                \
-      }                                                                        \
-      if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) {                   \
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-      }                                                                        \
-      ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                        \
-      memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));            \
-      memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));            \
-    }
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK)          \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v,  \
+               int width) {                                             \
+    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                \
+    memset(temp, 0, 128); /* for msan */                                \
+    int r = width & MASK;                                               \
+    int n = width & ~MASK;                                              \
+    if (n > 0) {                                                        \
+      ANY_SIMD(src_ptr, dst_u, dst_v, n);                               \
+    }                                                                   \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \
+    ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1);                   \
+    memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT));       \
+    memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT));       \
+  }
 
 #ifdef HAS_SPLITUVROW_SSE2
 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15)
@@ -711,8 +1036,8 @@
 #ifdef HAS_SPLITUVROW_NEON
 ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15)
 #endif
-#ifdef HAS_SPLITUVROW_DSPR2
-ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15)
+#ifdef HAS_SPLITUVROW_MSA
+ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31)
 #endif
 #ifdef HAS_ARGBTOUV444ROW_SSSE3
 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15)
@@ -727,37 +1052,66 @@
 #endif
 #ifdef HAS_YUY2TOUV422ROW_NEON
 ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7)
-ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
 ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
 ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
 #endif
+#ifdef HAS_YUY2TOUV422ROW_MSA
+ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15)
+ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
+#endif
 #undef ANY12
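
A scalar sketch of the de-interleave these SplitUV wrappers dispatch to: NV12-style interleaved UV fans out into two planes.

#include <stdint.h>

static void SplitUVRow(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0];
    dst_v[x] = src_uv[2 * x + 1];
  }
}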
 
+// Any 1 to 3.  Outputs RGB planes.
+#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK)                                \
+  void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g,     \
+               uint8_t* dst_b, int width) {                                \
+    SIMD_ALIGNED(uint8_t temp[16 * 6]);                                    \
+    memset(temp, 0, 16 * 3); /* for msan */                                \
+    int r = width & MASK;                                                  \
+    int n = width & ~MASK;                                                 \
+    if (n > 0) {                                                           \
+      ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n);                           \
+    }                                                                      \
+    memcpy(temp, src_ptr + n * BPP, r * BPP);                              \
+    ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \
+    memcpy(dst_r + n, temp + 16 * 3, r);                                   \
+    memcpy(dst_g + n, temp + 16 * 4, r);                                   \
+    memcpy(dst_b + n, temp + 16 * 5, r);                                   \
+  }
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15)
+#endif
+#ifdef HAS_SPLITRGBROW_NEON
+ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15)
+#endif
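
ANY13 is new in this update; a scalar sketch of the 1-to-3 split it wraps, with three bytes per pixel fanning out to three planes (the r/g/b ordering of the planes here is an assumption, not taken from the patch):

#include <stdint.h>

static void SplitRGBRow(const uint8_t* src_rgb, uint8_t* dst_r,
                        uint8_t* dst_g, uint8_t* dst_b, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_r[x] = src_rgb[3 * x + 0];
    dst_g[x] = src_rgb[3 * x + 1];
    dst_b[x] = src_rgb[3 * x + 2];
  }
}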
+
 // Any 1 to 2 with source stride (2 rows of source).  Outputs UV planes.
 // 128 byte row allows for 32 avx ARGB pixels.
-#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                          \
-    void NAMEANY(const uint8* src_ptr, int src_stride_ptr,                     \
-                 uint8* dst_u, uint8* dst_v, int width) {                      \
-      SIMD_ALIGNED(uint8 temp[128 * 4]);                                       \
-      memset(temp, 0, 128 * 2);  /* for msan */                                \
-      int r = width & MASK;                                                    \
-      int n = width & ~MASK;                                                   \
-      if (n > 0) {                                                             \
-        ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
-      }                                                                        \
-      memcpy(temp, src_ptr  + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);     \
-      memcpy(temp + 128, src_ptr  + src_stride_ptr + (n >> UVSHIFT) * BPP,     \
-             SS(r, UVSHIFT) * BPP);                                            \
-      if ((width & 1) && UVSHIFT == 0) {  /* repeat last pixel for subsample */\
-        memcpy(temp + SS(r, UVSHIFT) * BPP,                                    \
-               temp + SS(r, UVSHIFT) * BPP - BPP, BPP);                        \
-        memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
-               temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
-      }                                                                        \
-      ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
-      memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
-      memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
-    }
+#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                        \
+  void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u,   \
+               uint8_t* dst_v, int width) {                                  \
+    SIMD_ALIGNED(uint8_t temp[128 * 4]);                                     \
+    memset(temp, 0, 128 * 2); /* for msan */                                 \
+    int r = width & MASK;                                                    \
+    int n = width & ~MASK;                                                   \
+    if (n > 0) {                                                             \
+      ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n);                    \
+    }                                                                        \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);      \
+    memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP,      \
+           SS(r, UVSHIFT) * BPP);                                            \
+    if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \
+      memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \
+             BPP);                                                           \
+      memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                              \
+             temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                  \
+    }                                                                        \
+    ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1);                   \
+    memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1));                          \
+    memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1));                          \
+  }
 
 #ifdef HAS_ARGBTOUVROW_AVX2
 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
@@ -783,30 +1137,57 @@
 #ifdef HAS_ARGBTOUVROW_NEON
 ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVROW_MSA
+ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_ARGBTOUVJROW_NEON
 ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ARGBTOUVJROW_MSA
+ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_BGRATOUVROW_NEON
 ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_BGRATOUVROW_MSA
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_ABGRTOUVROW_NEON
 ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_ABGRTOUVROW_MSA
+ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_RGBATOUVROW_NEON
 ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15)
 #endif
+#ifdef HAS_RGBATOUVROW_MSA
+ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31)
+#endif
 #ifdef HAS_RGB24TOUVROW_NEON
 ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
 #endif
+#ifdef HAS_RGB24TOUVROW_MSA
+ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
+#endif
 #ifdef HAS_RAWTOUVROW_NEON
 ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
 #endif
+#ifdef HAS_RAWTOUVROW_MSA
+ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
+#endif
 #ifdef HAS_RGB565TOUVROW_NEON
 ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15)
 #endif
+#ifdef HAS_RGB565TOUVROW_MSA
+ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15)
+#endif
 #ifdef HAS_ARGB1555TOUVROW_NEON
 ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15)
 #endif
+#ifdef HAS_ARGB1555TOUVROW_MSA
+ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15)
+#endif
 #ifdef HAS_ARGB4444TOUVROW_NEON
 ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15)
 #endif
@@ -815,6 +1196,12 @@
 #endif
 #ifdef HAS_UYVYTOUVROW_NEON
 ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_MSA
+ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_MSA
+ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
 #endif
 #undef ANY12S
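
The literal 128 passed to ANY_SIMD in ANY12S does double duty: the tails of both source rows are staged 128 bytes apart in temp, and 128 then serves as the stride the kernel uses to find the second row. A standalone sketch of that staging trick, with VertAvgRow as a hypothetical two-row kernel (r must be at most 128):

#include <stdint.h>
#include <string.h>

static void VertAvgRow(const uint8_t* src, int stride, uint8_t* dst,
                       int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((src[x] + src[x + stride] + 1) >> 1);
  }
}

void VertAvgRow_Tail(const uint8_t* row0, const uint8_t* row1, uint8_t* dst,
                     int r) {
  uint8_t temp[128 * 3];
  memcpy(temp, row0, r);
  memcpy(temp + 128, row1, r);           /* second row, 128 bytes away */
  VertAvgRow(temp, 128, temp + 256, r);  /* 128 acts as the row stride */
  memcpy(dst, temp + 256, r);
}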
 
--- a/third_party/libyuv/source/row_common.cc
+++ b/third_party/libyuv/source/row_common.cc
@@ -10,6 +10,7 @@
 
 #include "libyuv/row.h"
 
+#include <stdio.h>
 #include <string.h>  // For memcpy and memset.
 
 #include "libyuv/basic_types.h"
@@ -23,59 +24,69 @@
 
 #define USE_BRANCHLESS 1
 #if USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
+static __inline int32_t clamp0(int32_t v) {
   return ((-(v) >> 31) & (v));
 }
 
-static __inline int32 clamp255(int32 v) {
+static __inline int32_t clamp255(int32_t v) {
   return (((255 - (v)) >> 31) | (v)) & 255;
 }
 
-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
+static __inline int32_t clamp1023(int32_t v) {
+  return (((1023 - (v)) >> 31) | (v)) & 1023;
 }
 
-static __inline uint32 Abs(int32 v) {
+static __inline uint32_t Abs(int32_t v) {
   int m = v >> 31;
   return (v + m) ^ m;
 }
-#else  // USE_BRANCHLESS
-static __inline int32 clamp0(int32 v) {
+#else   // USE_BRANCHLESS
+static __inline int32_t clamp0(int32_t v) {
   return (v < 0) ? 0 : v;
 }
 
-static __inline int32 clamp255(int32 v) {
+static __inline int32_t clamp255(int32_t v) {
   return (v > 255) ? 255 : v;
 }
 
-static __inline uint32 Clamp(int32 val) {
-  int v = clamp0(val);
-  return (uint32)(clamp255(v));
+static __inline int32_t clamp1023(int32_t v) {
+  return (v > 1023) ? 1023 : v;
 }
 
-static __inline uint32 Abs(int32 v) {
+static __inline uint32_t Abs(int32_t v) {
   return (v < 0) ? -v : v;
 }
 #endif  // USE_BRANCHLESS
+static __inline uint32_t Clamp(int32_t val) {
+  int v = clamp0(val);
+  return (uint32_t)(clamp255(v));
+}
 
-#ifdef LIBYUV_LITTLE_ENDIAN
-#define WRITEWORD(p, v) *(uint32*)(p) = v
+static __inline uint32_t Clamp10(int32_t val) {
+  int v = clamp0(val);
+  return (uint32_t)(clamp1023(v));
+}
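
The branchless clamps above lean on arithmetic right shift of a negative int32_t producing all ones (implementation-defined in C, but true for the compilers libyuv targets): -(v) >> 31 is 0 for positive v and -1 for negative v, and (255 - v) >> 31 is -1 exactly when v > 255. A quick self-check of the same expressions:

#include <assert.h>
#include <stdint.h>

static int32_t my_clamp0(int32_t v) { return ((-(v) >> 31) & (v)); }
static int32_t my_clamp255(int32_t v) {
  return (((255 - (v)) >> 31) | (v)) & 255;
}

int main(void) {
  assert(my_clamp0(-7) == 0 && my_clamp0(7) == 7);
  assert(my_clamp255(300) == 255 && my_clamp255(42) == 42);
  assert(my_clamp255(my_clamp0(-1)) == 0); /* how Clamp() composes them */
  return 0;
}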
+
+// Little Endian
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
+    defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) ||     \
+    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#define WRITEWORD(p, v) *(uint32_t*)(p) = v
 #else
-static inline void WRITEWORD(uint8* p, uint32 v) {
-  p[0] = (uint8)(v & 255);
-  p[1] = (uint8)((v >> 8) & 255);
-  p[2] = (uint8)((v >> 16) & 255);
-  p[3] = (uint8)((v >> 24) & 255);
+static inline void WRITEWORD(uint8_t* p, uint32_t v) {
+  p[0] = (uint8_t)(v & 255);
+  p[1] = (uint8_t)((v >> 8) & 255);
+  p[2] = (uint8_t)((v >> 16) & 255);
+  p[3] = (uint8_t)((v >> 24) & 255);
 }
 #endif
 
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb24[0];
-    uint8 g = src_rgb24[1];
-    uint8 r = src_rgb24[2];
+    uint8_t b = src_rgb24[0];
+    uint8_t g = src_rgb24[1];
+    uint8_t r = src_rgb24[2];
     dst_argb[0] = b;
     dst_argb[1] = g;
     dst_argb[2] = r;
@@ -85,12 +96,12 @@
   }
 }
 
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 r = src_raw[0];
-    uint8 g = src_raw[1];
-    uint8 b = src_raw[2];
+    uint8_t r = src_raw[0];
+    uint8_t g = src_raw[1];
+    uint8_t b = src_raw[2];
     dst_argb[0] = b;
     dst_argb[1] = g;
     dst_argb[2] = r;
@@ -100,12 +111,12 @@
   }
 }
 
-void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) {
+void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 r = src_raw[0];
-    uint8 g = src_raw[1];
-    uint8 b = src_raw[2];
+    uint8_t r = src_raw[0];
+    uint8_t g = src_raw[1];
+    uint8_t b = src_raw[2];
     dst_rgb24[0] = b;
     dst_rgb24[1] = g;
     dst_rgb24[2] = r;
@@ -114,12 +125,14 @@
   }
 }
 
-void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
+void RGB565ToARGBRow_C(const uint8_t* src_rgb565,
+                       uint8_t* dst_argb,
+                       int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb565[0] & 0x1f;
-    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r = src_rgb565[1] >> 3;
+    uint8_t b = src_rgb565[0] & 0x1f;
+    uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r = src_rgb565[1] >> 3;
     dst_argb[0] = (b << 3) | (b >> 2);
     dst_argb[1] = (g << 2) | (g >> 4);
     dst_argb[2] = (r << 3) | (r >> 2);
@@ -129,14 +142,15 @@
   }
 }
 
-void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555,
+                         uint8_t* dst_argb,
                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb1555[0] & 0x1f;
-    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 a = src_argb1555[1] >> 7;
+    uint8_t b = src_argb1555[0] & 0x1f;
+    uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t a = src_argb1555[1] >> 7;
     dst_argb[0] = (b << 3) | (b >> 2);
     dst_argb[1] = (g << 3) | (g >> 2);
     dst_argb[2] = (r << 3) | (r >> 2);
@@ -146,14 +160,15 @@
   }
 }
 
-void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444,
+                         uint8_t* dst_argb,
                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb4444[0] & 0x0f;
-    uint8 g = src_argb4444[0] >> 4;
-    uint8 r = src_argb4444[1] & 0x0f;
-    uint8 a = src_argb4444[1] >> 4;
+    uint8_t b = src_argb4444[0] & 0x0f;
+    uint8_t g = src_argb4444[0] >> 4;
+    uint8_t r = src_argb4444[1] & 0x0f;
+    uint8_t a = src_argb4444[1] >> 4;
     dst_argb[0] = (b << 4) | b;
     dst_argb[1] = (g << 4) | g;
     dst_argb[2] = (r << 4) | r;
@@ -163,12 +178,53 @@
   }
 }
 
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb[0];
-    uint8 g = src_argb[1];
-    uint8 r = src_argb[2];
+    uint32_t ar30 = *(const uint32_t*)src_ar30;
+    uint32_t b = (ar30 >> 2) & 0xff;
+    uint32_t g = (ar30 >> 12) & 0xff;
+    uint32_t r = (ar30 >> 22) & 0xff;
+    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24);
+    dst_argb += 4;
+    src_ar30 += 4;
+  }
+}
+
+void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t ar30 = *(const uint32_t*)src_ar30;
+    uint32_t b = (ar30 >> 2) & 0xff;
+    uint32_t g = (ar30 >> 12) & 0xff;
+    uint32_t r = (ar30 >> 22) & 0xff;
+    uint32_t a = (ar30 >> 30) * 0x55;  // Replicate 2 bits to 8 bits.
+    *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24);
+    dst_abgr += 4;
+    src_ar30 += 4;
+  }
+}
+
+void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t ar30 = *(const uint32_t*)src_ar30;
+    uint32_t b = ar30 & 0x3ff;
+    uint32_t ga = ar30 & 0xc00ffc00;
+    uint32_t r = (ar30 >> 20) & 0x3ff;
+    *(uint32_t*)(dst_ab30) = r | ga | (b << 20);
+    dst_ab30 += 4;
+    src_ar30 += 4;
+  }
+}
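
The AR30 layout these new helpers use, read off the packing in ARGBToAR30Row_C below: 10-bit B in bits 0..9, G in 10..19, R in 20..29, and 2-bit alpha in 30..31 of a little-endian word. A quick self-check of the extraction arithmetic (PackAR30 is a helper written for this note):

#include <assert.h>
#include <stdint.h>

static uint32_t PackAR30(uint32_t a2, uint32_t r10, uint32_t g10,
                         uint32_t b10) {
  return b10 | (g10 << 10) | (r10 << 20) | (a2 << 30);
}

int main(void) {
  uint32_t white = PackAR30(3, 1023, 1023, 1023);
  assert(((white >> 2) & 0xff) == 255); /* top 8 of 10-bit B, as above */
  assert((white >> 30) * 0x55 == 255);  /* 2-bit alpha replicated to 8 bits */
  return 0;
}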
+
+void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint8_t b = src_argb[0];
+    uint8_t g = src_argb[1];
+    uint8_t r = src_argb[2];
     dst_rgb[0] = b;
     dst_rgb[1] = g;
     dst_rgb[2] = r;
@@ -177,12 +233,12 @@
   }
 }
 
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb[0];
-    uint8 g = src_argb[1];
-    uint8 r = src_argb[2];
+    uint8_t b = src_argb[0];
+    uint8_t g = src_argb[1];
+    uint8_t r = src_argb[2];
     dst_rgb[0] = r;
     dst_rgb[1] = g;
     dst_rgb[2] = b;
@@ -191,25 +247,25 @@
   }
 }
 
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 2;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 b1 = src_argb[4] >> 3;
-    uint8 g1 = src_argb[5] >> 2;
-    uint8 r1 = src_argb[6] >> 3;
-    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
-              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 2;
+    uint8_t r0 = src_argb[2] >> 3;
+    uint8_t b1 = src_argb[4] >> 3;
+    uint8_t g1 = src_argb[5] >> 2;
+    uint8_t r1 = src_argb[6] >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 2;
-    uint8 r0 = src_argb[2] >> 3;
-    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 2;
+    uint8_t r0 = src_argb[2] >> 3;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
@@ -221,132 +277,160 @@
 // endian will not affect order of the original matrix.  But the dither4
 // will contain the first pixel in the lower byte for little endian
 // or the upper byte for big endian.
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint32 dither4, int width) {
+void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_rgb,
+                             const uint32_t dither4,
+                             int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     int dither0 = ((const unsigned char*)(&dither4))[x & 3];
     int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
-    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
-    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
-    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
-    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
-    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
-    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
-    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
-              (b1 << 16) | (g1 << 21) | (r1 << 27));
+    uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
+    uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3;
+    uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2;
+    uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3;
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
     int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
-    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
-    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
-    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
-    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
+    uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
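
dither4 packs four per-column offsets, one byte each; pixel x uses byte x & 3, added to every channel before the 5/6/5 truncation. A tiny demonstration of the selection (byte order follows host endianness, which is what the comment above warns about):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint32_t dither4 = 0x03020100u; /* offsets 0,1,2,3 on little endian */
  int x;
  for (x = 0; x < 8; ++x) {
    printf("x=%d dither=%d\n", x, ((const unsigned char*)&dither4)[x & 3]);
  }
  return 0;
}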
 
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 3;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 a0 = src_argb[3] >> 7;
-    uint8 b1 = src_argb[4] >> 3;
-    uint8 g1 = src_argb[5] >> 3;
-    uint8 r1 = src_argb[6] >> 3;
-    uint8 a1 = src_argb[7] >> 7;
-    *(uint32*)(dst_rgb) =
-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
-        (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 3;
+    uint8_t r0 = src_argb[2] >> 3;
+    uint8_t a0 = src_argb[3] >> 7;
+    uint8_t b1 = src_argb[4] >> 3;
+    uint8_t g1 = src_argb[5] >> 3;
+    uint8_t r1 = src_argb[6] >> 3;
+    uint8_t a1 = src_argb[7] >> 7;
+    *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) |
+                            (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31);
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
-    uint8 b0 = src_argb[0] >> 3;
-    uint8 g0 = src_argb[1] >> 3;
-    uint8 r0 = src_argb[2] >> 3;
-    uint8 a0 = src_argb[3] >> 7;
-    *(uint16*)(dst_rgb) =
-        b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
+    uint8_t b0 = src_argb[0] >> 3;
+    uint8_t g0 = src_argb[1] >> 3;
+    uint8_t r0 = src_argb[2] >> 3;
+    uint8_t a0 = src_argb[3] >> 7;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15);
   }
 }
 
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb[0] >> 4;
-    uint8 g0 = src_argb[1] >> 4;
-    uint8 r0 = src_argb[2] >> 4;
-    uint8 a0 = src_argb[3] >> 4;
-    uint8 b1 = src_argb[4] >> 4;
-    uint8 g1 = src_argb[5] >> 4;
-    uint8 r1 = src_argb[6] >> 4;
-    uint8 a1 = src_argb[7] >> 4;
-    *(uint32*)(dst_rgb) =
-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
-        (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
+    uint8_t b0 = src_argb[0] >> 4;
+    uint8_t g0 = src_argb[1] >> 4;
+    uint8_t r0 = src_argb[2] >> 4;
+    uint8_t a0 = src_argb[3] >> 4;
+    uint8_t b1 = src_argb[4] >> 4;
+    uint8_t g1 = src_argb[5] >> 4;
+    uint8_t r1 = src_argb[6] >> 4;
+    uint8_t a1 = src_argb[7] >> 4;
+    *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) |
+                            (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28);
     dst_rgb += 4;
     src_argb += 8;
   }
   if (width & 1) {
-    uint8 b0 = src_argb[0] >> 4;
-    uint8 g0 = src_argb[1] >> 4;
-    uint8 r0 = src_argb[2] >> 4;
-    uint8 a0 = src_argb[3] >> 4;
-    *(uint16*)(dst_rgb) =
-        b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
+    uint8_t b0 = src_argb[0] >> 4;
+    uint8_t g0 = src_argb[1] >> 4;
+    uint8_t r0 = src_argb[2] >> 4;
+    uint8_t a0 = src_argb[3] >> 4;
+    *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12);
   }
 }
 
-static __inline int RGBToY(uint8 r, uint8 g, uint8 b) {
-  return (66 * r + 129 * g +  25 * b + 0x1080) >> 8;
+void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2);
+    uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2);
+    uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2);
+    uint32_t a0 = (src_abgr[3] >> 6);
+    *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30);
+    dst_ar30 += 4;
+    src_abgr += 4;
+  }
 }
 
-static __inline int RGBToU(uint8 r, uint8 g, uint8 b) {
+void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2);
+    uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2);
+    uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2);
+    uint32_t a0 = (src_argb[3] >> 6);
+    *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30);
+    dst_ar30 += 4;
+    src_argb += 4;
+  }
+}
+
+static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
+  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
+}
+
+static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
   return (112 * b - 74 * g - 38 * r + 0x8080) >> 8;
 }
-static __inline int RGBToV(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
   return (112 * r - 94 * g - 18 * b + 0x8080) >> 8;
 }
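
The coefficients in RGBToY above are round(BT.601 * 256) with a bias of 0x1080 = 16 * 256 + 128, i.e. the +16 studio-swing offset plus one half for rounding, so full white and black land on 235 and 16. A self-check of that arithmetic:

#include <assert.h>
#include <stdint.h>

static int MyRGBToY(uint8_t r, uint8_t g, uint8_t b) {
  return (66 * r + 129 * g + 25 * b + 0x1080) >> 8;
}

int main(void) {
  assert(MyRGBToY(255, 255, 255) == 235); /* (220 * 255 + 4224) >> 8 */
  assert(MyRGBToY(0, 0, 0) == 16);        /* 4224 >> 8 */
  return 0;
}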
 
-#define MAKEROWY(NAME, R, G, B, BPP) \
-void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {       \
-  int x;                                                                       \
-  for (x = 0; x < width; ++x) {                                                \
-    dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);               \
-    src_argb0 += BPP;                                                          \
-    dst_y += 1;                                                                \
-  }                                                                            \
-}                                                                              \
-void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb,              \
-                       uint8* dst_u, uint8* dst_v, int width) {                \
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
-  int x;                                                                       \
-  for (x = 0; x < width - 1; x += 2) {                                         \
-    uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] +                              \
-               src_rgb1[B] + src_rgb1[B + BPP]) >> 2;                          \
-    uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] +                              \
-               src_rgb1[G] + src_rgb1[G + BPP]) >> 2;                          \
-    uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] +                              \
-               src_rgb1[R] + src_rgb1[R + BPP]) >> 2;                          \
-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
-    src_rgb0 += BPP * 2;                                                       \
-    src_rgb1 += BPP * 2;                                                       \
-    dst_u += 1;                                                                \
-    dst_v += 1;                                                                \
-  }                                                                            \
-  if (width & 1) {                                                             \
-    uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                               \
-    uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                               \
-    uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                               \
-    dst_u[0] = RGBToU(ar, ag, ab);                                             \
-    dst_v[0] = RGBToV(ar, ag, ab);                                             \
-  }                                                                            \
-}
+// ARGBToY_C and ARGBToUV_C
+#define MAKEROWY(NAME, R, G, B, BPP)                                         \
+  void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+    int x;                                                                   \
+    for (x = 0; x < width; ++x) {                                            \
+      dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]);           \
+      src_argb0 += BPP;                                                      \
+      dst_y += 1;                                                            \
+    }                                                                        \
+  }                                                                          \
+  void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb,          \
+                       uint8_t* dst_u, uint8_t* dst_v, int width) {          \
+    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb;                     \
+    int x;                                                                   \
+    for (x = 0; x < width - 1; x += 2) {                                     \
+      uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] +          \
+                    src_rgb1[B + BPP]) >>                                    \
+                   2;                                                        \
+      uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] +          \
+                    src_rgb1[G + BPP]) >>                                    \
+                   2;                                                        \
+      uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] +          \
+                    src_rgb1[R + BPP]) >>                                    \
+                   2;                                                        \
+      dst_u[0] = RGBToU(ar, ag, ab);                                         \
+      dst_v[0] = RGBToV(ar, ag, ab);                                         \
+      src_rgb0 += BPP * 2;                                                   \
+      src_rgb1 += BPP * 2;                                                   \
+      dst_u += 1;                                                            \
+      dst_v += 1;                                                            \
+    }                                                                        \
+    if (width & 1) {                                                         \
+      uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1;                         \
+      uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1;                         \
+      uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1;                         \
+      dst_u[0] = RGBToU(ar, ag, ab);                                         \
+      dst_v[0] = RGBToV(ar, ag, ab);                                         \
+    }                                                                        \
+  }
 
 MAKEROWY(ARGB, 2, 1, 0, 4)
 MAKEROWY(BGRA, 1, 2, 3, 4)
@@ -381,64 +465,65 @@
 // g -0.41869 * 255 = -106.76595 = -107
 // r  0.50000 * 255 = 127.5 = 127
 
-static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) {
-  return (38 * r + 75 * g +  15 * b + 64) >> 7;
+static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
+  return (38 * r + 75 * g + 15 * b + 64) >> 7;
 }
 
-static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) {
   return (127 * b - 84 * g - 43 * r + 0x8080) >> 8;
 }
-static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) {
+static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
   return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
 }
 
 #define AVGB(a, b) (((a) + (b) + 1) >> 1)
 
-#define MAKEROWYJ(NAME, R, G, B, BPP) \
-void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) {      \
-  int x;                                                                       \
-  for (x = 0; x < width; ++x) {                                                \
-    dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);              \
-    src_argb0 += BPP;                                                          \
-    dst_y += 1;                                                                \
-  }                                                                            \
-}                                                                              \
-void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb,             \
-                        uint8* dst_u, uint8* dst_v, int width) {               \
-  const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;                           \
-  int x;                                                                       \
-  for (x = 0; x < width - 1; x += 2) {                                         \
-    uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                            \
-                    AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));               \
-    uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                            \
-                    AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));               \
-    uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                            \
-                    AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));               \
-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
-    src_rgb0 += BPP * 2;                                                       \
-    src_rgb1 += BPP * 2;                                                       \
-    dst_u += 1;                                                                \
-    dst_v += 1;                                                                \
-  }                                                                            \
-  if (width & 1) {                                                             \
-    uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]);                                 \
-    uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]);                                 \
-    uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]);                                 \
-    dst_u[0] = RGBToUJ(ar, ag, ab);                                            \
-    dst_v[0] = RGBToVJ(ar, ag, ab);                                            \
-  }                                                                            \
-}
+// ARGBToYJ_C and ARGBToUVJ_C
+#define MAKEROWYJ(NAME, R, G, B, BPP)                                         \
+  void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
+    int x;                                                                    \
+    for (x = 0; x < width; ++x) {                                             \
+      dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]);           \
+      src_argb0 += BPP;                                                       \
+      dst_y += 1;                                                             \
+    }                                                                         \
+  }                                                                           \
+  void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb,          \
+                        uint8_t* dst_u, uint8_t* dst_v, int width) {          \
+    const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb;                      \
+    int x;                                                                    \
+    for (x = 0; x < width - 1; x += 2) {                                      \
+      uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]),                       \
+                        AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP]));          \
+      uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]),                       \
+                        AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP]));          \
+      uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]),                       \
+                        AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP]));          \
+      dst_u[0] = RGBToUJ(ar, ag, ab);                                         \
+      dst_v[0] = RGBToVJ(ar, ag, ab);                                         \
+      src_rgb0 += BPP * 2;                                                    \
+      src_rgb1 += BPP * 2;                                                    \
+      dst_u += 1;                                                             \
+      dst_v += 1;                                                             \
+    }                                                                         \
+    if (width & 1) {                                                          \
+      uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]);                            \
+      uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]);                            \
+      uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]);                            \
+      dst_u[0] = RGBToUJ(ar, ag, ab);                                         \
+      dst_v[0] = RGBToVJ(ar, ag, ab);                                         \
+    }                                                                         \
+  }
 
 MAKEROWYJ(ARGB, 2, 1, 0, 4)
 #undef MAKEROWYJ
 
-void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) {
+void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_rgb565[0] & 0x1f;
-    uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r = src_rgb565[1] >> 3;
+    uint8_t b = src_rgb565[0] & 0x1f;
+    uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r = src_rgb565[1] >> 3;
     b = (b << 3) | (b >> 2);
     g = (g << 2) | (g >> 4);
     r = (r << 3) | (r >> 2);
@@ -448,12 +533,12 @@
   }
 }
 
-void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) {
+void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb1555[0] & 0x1f;
-    uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t b = src_argb1555[0] & 0x1f;
+    uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r = (src_argb1555[1] & 0x7c) >> 2;
     b = (b << 3) | (b >> 2);
     g = (g << 3) | (g >> 2);
     r = (r << 3) | (r >> 2);
@@ -463,12 +548,12 @@
   }
 }
 
-void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) {
+void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 b = src_argb4444[0] & 0x0f;
-    uint8 g = src_argb4444[0] >> 4;
-    uint8 r = src_argb4444[1] & 0x0f;
+    uint8_t b = src_argb4444[0] & 0x0f;
+    uint8_t g = src_argb4444[0] >> 4;
+    uint8_t r = src_argb4444[1] & 0x0f;
     b = (b << 4) | b;
     g = (g << 4) | g;
     r = (r << 4) | r;
@@ -478,26 +563,29 @@
   }
 }
 
-void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
-                     uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565;
+void RGB565ToUVRow_C(const uint8_t* src_rgb565,
+                     int src_stride_rgb565,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_rgb565[0] & 0x1f;
-    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r0 = src_rgb565[1] >> 3;
-    uint8 b1 = src_rgb565[2] & 0x1f;
-    uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
-    uint8 r1 = src_rgb565[3] >> 3;
-    uint8 b2 = next_rgb565[0] & 0x1f;
-    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
-    uint8 r2 = next_rgb565[1] >> 3;
-    uint8 b3 = next_rgb565[2] & 0x1f;
-    uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
-    uint8 r3 = next_rgb565[3] >> 3;
-    uint8 b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
+    uint8_t b0 = src_rgb565[0] & 0x1f;
+    uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r0 = src_rgb565[1] >> 3;
+    uint8_t b1 = src_rgb565[2] & 0x1f;
+    uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3);
+    uint8_t r1 = src_rgb565[3] >> 3;
+    uint8_t b2 = next_rgb565[0] & 0x1f;
+    uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8_t r2 = next_rgb565[1] >> 3;
+    uint8_t b3 = next_rgb565[2] & 0x1f;
+    uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3);
+    uint8_t r3 = next_rgb565[3] >> 3;
+    uint8_t b = (b0 + b1 + b2 + b3);  // 565 * 4 = 787.
+    uint8_t g = (g0 + g1 + g2 + g3);
+    uint8_t r = (r0 + r1 + r2 + r3);
     b = (b << 1) | (b >> 6);  // 787 -> 888.
     r = (r << 1) | (r >> 6);
     dst_u[0] = RGBToU(r, g, b);
@@ -508,15 +596,15 @@
     dst_v += 1;
   }
   if (width & 1) {
-    uint8 b0 = src_rgb565[0] & 0x1f;
-    uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
-    uint8 r0 = src_rgb565[1] >> 3;
-    uint8 b2 = next_rgb565[0] & 0x1f;
-    uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
-    uint8 r2 = next_rgb565[1] >> 3;
-    uint8 b = (b0 + b2);  // 565 * 2 = 676.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
+    uint8_t b0 = src_rgb565[0] & 0x1f;
+    uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3);
+    uint8_t r0 = src_rgb565[1] >> 3;
+    uint8_t b2 = next_rgb565[0] & 0x1f;
+    uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3);
+    uint8_t r2 = next_rgb565[1] >> 3;
+    uint8_t b = (b0 + b2);  // 565 * 2 = 676.
+    uint8_t g = (g0 + g2);
+    uint8_t r = (r0 + r2);
     b = (b << 2) | (b >> 4);  // 676 -> 888
     g = (g << 1) | (g >> 6);
     r = (r << 2) | (r >> 4);
@@ -525,26 +613,29 @@
   }
 }
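
The "565 * 4 = 787" comment above tracks bit widths: four 5-bit blue samples sum to at most 124 (7 bits) and four 6-bit green samples to at most 252 (8 bits), so the sums can be widened straight to 8 bits instead of being averaged back down first. A small check of the blue path:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t b0 = 31, b1 = 30, b2 = 31, b3 = 30;     // four 5-bit samples
  uint8_t b = (uint8_t)(b0 + b1 + b2 + b3);       // 7-bit sum, max 124
  uint8_t b888 = (uint8_t)((b << 1) | (b >> 6));  // 787 -> 888
  printf("%u\n", b888);  // 245: the 5-bit average (30.5) bit-replicated
  return 0;
}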
 
-void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555;
+void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
+                       int src_stride_argb1555,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb1555[0] & 0x1f;
-    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 b1 = src_argb1555[2] & 0x1f;
-    uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
-    uint8 r1 = (src_argb1555[3] & 0x7c) >> 2;
-    uint8 b2 = next_argb1555[0] & 0x1f;
-    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
-    uint8 r2 = (next_argb1555[1] & 0x7c) >> 2;
-    uint8 b3 = next_argb1555[2] & 0x1f;
-    uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
-    uint8 r3 = (next_argb1555[3] & 0x7c) >> 2;
-    uint8 b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
+    uint8_t b0 = src_argb1555[0] & 0x1f;
+    uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t b1 = src_argb1555[2] & 0x1f;
+    uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3);
+    uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2;
+    uint8_t b2 = next_argb1555[0] & 0x1f;
+    uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2;
+    uint8_t b3 = next_argb1555[2] & 0x1f;
+    uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3);
+    uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2;
+    uint8_t b = (b0 + b1 + b2 + b3);  // 555 * 4 = 777.
+    uint8_t g = (g0 + g1 + g2 + g3);
+    uint8_t r = (r0 + r1 + r2 + r3);
     b = (b << 1) | (b >> 6);  // 777 -> 888.
     g = (g << 1) | (g >> 6);
     r = (r << 1) | (r >> 6);
@@ -556,15 +647,15 @@
     dst_v += 1;
   }
   if (width & 1) {
-    uint8 b0 = src_argb1555[0] & 0x1f;
-    uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
-    uint8 r0 = (src_argb1555[1] & 0x7c) >> 2;
-    uint8 b2 = next_argb1555[0] & 0x1f;
-    uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
-    uint8 r2 = next_argb1555[1] >> 3;
-    uint8 b = (b0 + b2);  // 555 * 2 = 666.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
+    uint8_t b0 = src_argb1555[0] & 0x1f;
+    uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
+    uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2;
+    uint8_t b2 = next_argb1555[0] & 0x1f;
+    uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3);
+    uint8_t r2 = next_argb1555[1] >> 3;
+    uint8_t b = (b0 + b2);  // 555 * 2 = 666.
+    uint8_t g = (g0 + g2);
+    uint8_t r = (r0 + r2);
     b = (b << 2) | (b >> 4);  // 666 -> 888.
     g = (g << 2) | (g >> 4);
     r = (r << 2) | (r >> 4);
@@ -573,26 +664,29 @@
   }
 }
 
-void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444;
+void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
+                       int src_stride_argb4444,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444;
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 b0 = src_argb4444[0] & 0x0f;
-    uint8 g0 = src_argb4444[0] >> 4;
-    uint8 r0 = src_argb4444[1] & 0x0f;
-    uint8 b1 = src_argb4444[2] & 0x0f;
-    uint8 g1 = src_argb4444[2] >> 4;
-    uint8 r1 = src_argb4444[3] & 0x0f;
-    uint8 b2 = next_argb4444[0] & 0x0f;
-    uint8 g2 = next_argb4444[0] >> 4;
-    uint8 r2 = next_argb4444[1] & 0x0f;
-    uint8 b3 = next_argb4444[2] & 0x0f;
-    uint8 g3 = next_argb4444[2] >> 4;
-    uint8 r3 = next_argb4444[3] & 0x0f;
-    uint8 b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
-    uint8 g = (g0 + g1 + g2 + g3);
-    uint8 r = (r0 + r1 + r2 + r3);
+    uint8_t b0 = src_argb4444[0] & 0x0f;
+    uint8_t g0 = src_argb4444[0] >> 4;
+    uint8_t r0 = src_argb4444[1] & 0x0f;
+    uint8_t b1 = src_argb4444[2] & 0x0f;
+    uint8_t g1 = src_argb4444[2] >> 4;
+    uint8_t r1 = src_argb4444[3] & 0x0f;
+    uint8_t b2 = next_argb4444[0] & 0x0f;
+    uint8_t g2 = next_argb4444[0] >> 4;
+    uint8_t r2 = next_argb4444[1] & 0x0f;
+    uint8_t b3 = next_argb4444[2] & 0x0f;
+    uint8_t g3 = next_argb4444[2] >> 4;
+    uint8_t r3 = next_argb4444[3] & 0x0f;
+    uint8_t b = (b0 + b1 + b2 + b3);  // 444 * 4 = 666.
+    uint8_t g = (g0 + g1 + g2 + g3);
+    uint8_t r = (r0 + r1 + r2 + r3);
     b = (b << 2) | (b >> 4);  // 666 -> 888.
     g = (g << 2) | (g >> 4);
     r = (r << 2) | (r >> 4);
@@ -604,15 +698,15 @@
     dst_v += 1;
   }
   if (width & 1) {
-    uint8 b0 = src_argb4444[0] & 0x0f;
-    uint8 g0 = src_argb4444[0] >> 4;
-    uint8 r0 = src_argb4444[1] & 0x0f;
-    uint8 b2 = next_argb4444[0] & 0x0f;
-    uint8 g2 = next_argb4444[0] >> 4;
-    uint8 r2 = next_argb4444[1] & 0x0f;
-    uint8 b = (b0 + b2);  // 444 * 2 = 555.
-    uint8 g = (g0 + g2);
-    uint8 r = (r0 + r2);
+    uint8_t b0 = src_argb4444[0] & 0x0f;
+    uint8_t g0 = src_argb4444[0] >> 4;
+    uint8_t r0 = src_argb4444[1] & 0x0f;
+    uint8_t b2 = next_argb4444[0] & 0x0f;
+    uint8_t g2 = next_argb4444[0] >> 4;
+    uint8_t r2 = next_argb4444[1] & 0x0f;
+    uint8_t b = (b0 + b2);  // 444 * 2 = 555.
+    uint8_t g = (g0 + g2);
+    uint8_t r = (r0 + r2);
     b = (b << 3) | (b >> 2);  // 555 -> 888.
     g = (g << 3) | (g >> 2);
     r = (r << 3) | (r >> 2);
@@ -621,13 +715,15 @@
   }
 }
 
-void ARGBToUV444Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUV444Row_C(const uint8_t* src_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 ab = src_argb[0];
-    uint8 ag = src_argb[1];
-    uint8 ar = src_argb[2];
+    uint8_t ab = src_argb[0];
+    uint8_t ag = src_argb[1];
+    uint8_t ar = src_argb[2];
     dst_u[0] = RGBToU(ar, ag, ab);
     dst_v[0] = RGBToV(ar, ag, ab);
     src_argb += 4;
@@ -636,45 +732,10 @@
   }
 }
 
-void ARGBToUV411Row_C(const uint8* src_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
   int x;
-  for (x = 0; x < width - 3; x += 4) {
-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2;
-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2;
-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-    src_argb += 16;
-    dst_u += 1;
-    dst_v += 1;
-  }
-  // Odd width handling mimics 'any' function which replicates last pixel.
-  if ((width & 3) == 3) {
-    uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2;
-    uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2;
-    uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  } else if ((width & 3) == 2) {
-    uint8 ab = (src_argb[0] + src_argb[4]) >> 1;
-    uint8 ag = (src_argb[1] + src_argb[5]) >> 1;
-    uint8 ar = (src_argb[2] + src_argb[6]) >> 1;
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  } else if ((width & 3) == 1) {
-    uint8 ab = src_argb[0];
-    uint8 ag = src_argb[1];
-    uint8 ar = src_argb[2];
-    dst_u[0] = RGBToU(ar, ag, ab);
-    dst_v[0] = RGBToV(ar, ag, ab);
-  }
-}
-
-void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
-  int x;
   for (x = 0; x < width; ++x) {
-    uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
+    uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]);
     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = src_argb[3];
     dst_argb += 4;
@@ -683,7 +744,7 @@
 }
 
 // Convert a row of image to Sepia tone.
-void ARGBSepiaRow_C(uint8* dst_argb, int width) {
+void ARGBSepiaRow_C(uint8_t* dst_argb, int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -702,8 +763,10 @@
 
 // Apply color matrix to a row of image. Matrix is signed.
 // TODO(fbarchard): Consider adding rounding (+32).
-void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb,
-                          const int8* matrix_argb, int width) {
+void ARGBColorMatrixRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const int8_t* matrix_argb,
+                          int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = src_argb[0];
@@ -710,14 +773,18 @@
     int g = src_argb[1];
     int r = src_argb[2];
     int a = src_argb[3];
-    int sb = (b * matrix_argb[0] + g * matrix_argb[1] +
-              r * matrix_argb[2] + a * matrix_argb[3]) >> 6;
-    int sg = (b * matrix_argb[4] + g * matrix_argb[5] +
-              r * matrix_argb[6] + a * matrix_argb[7]) >> 6;
-    int sr = (b * matrix_argb[8] + g * matrix_argb[9] +
-              r * matrix_argb[10] + a * matrix_argb[11]) >> 6;
-    int sa = (b * matrix_argb[12] + g * matrix_argb[13] +
-              r * matrix_argb[14] + a * matrix_argb[15]) >> 6;
+    int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] +
+              a * matrix_argb[3]) >>
+             6;
+    int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] +
+              a * matrix_argb[7]) >>
+             6;
+    int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] +
+              a * matrix_argb[11]) >>
+             6;
+    int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] +
+              a * matrix_argb[15]) >>
+             6;
     dst_argb[0] = Clamp(sb);
     dst_argb[1] = Clamp(sg);
     dst_argb[2] = Clamp(sr);
@@ -728,7 +795,9 @@
 }
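
The matrix coefficients consumed above are signed 1.6 fixed point (hence the >> 6), so 64 represents 1.0. A quick identity-row check under that reading:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int8_t m[4] = {64, 0, 0, 0};  // first row of an identity matrix
  int b = 200, g = 100, r = 50, a = 255;
  int sb = (b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6;
  printf("%d\n", sb);  // 200: a coefficient of 64 passes the channel through
  return 0;
}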
 
 // Apply color table to a row of image.
-void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+void ARGBColorTableRow_C(uint8_t* dst_argb,
+                         const uint8_t* table_argb,
+                         int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -744,7 +813,9 @@
 }
 
 // Apply color table to a row of image.
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) {
+void RGBColorTableRow_C(uint8_t* dst_argb,
+                        const uint8_t* table_argb,
+                        int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -757,8 +828,11 @@
   }
 }
 
-void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
-                       int interval_offset, int width) {
+void ARGBQuantizeRow_C(uint8_t* dst_argb,
+                       int scale,
+                       int interval_size,
+                       int interval_offset,
+                       int width) {
   int x;
   for (x = 0; x < width; ++x) {
     int b = dst_argb[0];
@@ -772,21 +846,23 @@
 }
 
 #define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 24
+#define SHADE(f, v) v* f >> 24
 
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                    uint32 value) {
-  const uint32 b_scale = REPEAT8(value & 0xff);
-  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
-  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
-  const uint32 a_scale = REPEAT8(value >> 24);
+void ARGBShadeRow_C(const uint8_t* src_argb,
+                    uint8_t* dst_argb,
+                    int width,
+                    uint32_t value) {
+  const uint32_t b_scale = REPEAT8(value & 0xff);
+  const uint32_t g_scale = REPEAT8((value >> 8) & 0xff);
+  const uint32_t r_scale = REPEAT8((value >> 16) & 0xff);
+  const uint32_t a_scale = REPEAT8(value >> 24);
 
   int i;
   for (i = 0; i < width; ++i) {
-    const uint32 b = REPEAT8(src_argb[0]);
-    const uint32 g = REPEAT8(src_argb[1]);
-    const uint32 r = REPEAT8(src_argb[2]);
-    const uint32 a = REPEAT8(src_argb[3]);
+    const uint32_t b = REPEAT8(src_argb[0]);
+    const uint32_t g = REPEAT8(src_argb[1]);
+    const uint32_t r = REPEAT8(src_argb[2]);
+    const uint32_t a = REPEAT8(src_argb[3]);
     dst_argb[0] = SHADE(b, b_scale);
     dst_argb[1] = SHADE(g, g_scale);
     dst_argb[2] = SHADE(r, r_scale);
@@ -799,20 +875,22 @@
 #undef SHADE
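
REPEAT8(v) duplicates an 8-bit value into both bytes of a 16-bit word, i.e. v * 0x0101, a cheap stand-in for v * 65536 / 255 (note 255 * 257 == 65535). Multiplying two REPEAT8 values and shifting right by 24, as SHADE does above, therefore approximates the normalized product f * v / 255. A sketch:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint32_t scale = 255 | (255 << 8);      // REPEAT8(255): shade of 1.0
  uint32_t pixel = 128 | (128 << 8);      // REPEAT8(128)
  printf("%u\n", (pixel * scale) >> 24);  // 128: full shade is a no-op
  return 0;
}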
 
 #define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 16
+#define SHADE(f, v) v* f >> 16
 
-void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+void ARGBMultiplyRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
-    const uint32 b = REPEAT8(src_argb0[0]);
-    const uint32 g = REPEAT8(src_argb0[1]);
-    const uint32 r = REPEAT8(src_argb0[2]);
-    const uint32 a = REPEAT8(src_argb0[3]);
-    const uint32 b_scale = src_argb1[0];
-    const uint32 g_scale = src_argb1[1];
-    const uint32 r_scale = src_argb1[2];
-    const uint32 a_scale = src_argb1[3];
+    const uint32_t b = REPEAT8(src_argb0[0]);
+    const uint32_t g = REPEAT8(src_argb0[1]);
+    const uint32_t r = REPEAT8(src_argb0[2]);
+    const uint32_t a = REPEAT8(src_argb0[3]);
+    const uint32_t b_scale = src_argb1[0];
+    const uint32_t g_scale = src_argb1[1];
+    const uint32_t r_scale = src_argb1[2];
+    const uint32_t a_scale = src_argb1[3];
     dst_argb[0] = SHADE(b, b_scale);
     dst_argb[1] = SHADE(g, g_scale);
     dst_argb[2] = SHADE(r, r_scale);
@@ -827,8 +905,10 @@
 
 #define SHADE(f, v) clamp255(v + f)
 
-void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                  uint8* dst_argb, int width) {
+void ARGBAddRow_C(const uint8_t* src_argb0,
+                  const uint8_t* src_argb1,
+                  uint8_t* dst_argb,
+                  int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const int b = src_argb0[0];
@@ -852,8 +932,10 @@
 
 #define SHADE(f, v) clamp0(f - v)
 
-void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
+void ARGBSubtractRow_C(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
     const int b = src_argb0[0];
@@ -876,8 +958,11 @@
 #undef SHADE
 
 // Sobel functions which mimic SSSE3.
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2,
-                 uint8* dst_sobelx, int width) {
+void SobelXRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 const uint8_t* src_y2,
+                 uint8_t* dst_sobelx,
+                 int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int a = src_y0[i];
@@ -890,12 +975,14 @@
     int b_diff = b - b_sub;
     int c_diff = c - c_sub;
     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
-    dst_sobelx[i] = (uint8)(clamp255(sobel));
+    dst_sobelx[i] = (uint8_t)(clamp255(sobel));
   }
 }
 
-void SobelYRow_C(const uint8* src_y0, const uint8* src_y1,
-                 uint8* dst_sobely, int width) {
+void SobelYRow_C(const uint8_t* src_y0,
+                 const uint8_t* src_y1,
+                 uint8_t* dst_sobely,
+                 int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int a = src_y0[i + 0];
@@ -908,56 +995,62 @@
     int b_diff = b - b_sub;
     int c_diff = c - c_sub;
     int sobel = Abs(a_diff + b_diff * 2 + c_diff);
-    dst_sobely[i] = (uint8)(clamp255(sobel));
+    dst_sobely[i] = (uint8_t)(clamp255(sobel));
   }
 }
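
The difference form above is the classic 3x3 Sobel kernel unrolled: for SobelX the column differences are weighted 1-2-1, i.e. the kernel [1 0 -1; 2 0 -2; 1 0 -1]. A tiny sketch on a hard vertical edge:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  int row0[3] = {0, 0, 255}, row1[3] = {0, 0, 255}, row2[3] = {0, 0, 255};
  int sobel = abs((row0[0] - row0[2]) + 2 * (row1[0] - row1[2]) +
                  (row2[0] - row2[2]));
  printf("%d\n", sobel > 255 ? 255 : sobel);  // 1020 clamps to 255
  return 0;
}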
 
-void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                uint8* dst_argb, int width) {
+void SobelRow_C(const uint8_t* src_sobelx,
+                const uint8_t* src_sobely,
+                uint8_t* dst_argb,
+                int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
     int b = src_sobely[i];
     int s = clamp255(r + b);
-    dst_argb[0] = (uint8)(s);
-    dst_argb[1] = (uint8)(s);
-    dst_argb[2] = (uint8)(s);
-    dst_argb[3] = (uint8)(255u);
+    dst_argb[0] = (uint8_t)(s);
+    dst_argb[1] = (uint8_t)(s);
+    dst_argb[2] = (uint8_t)(s);
+    dst_argb[3] = (uint8_t)(255u);
     dst_argb += 4;
   }
 }
 
-void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                       uint8* dst_y, int width) {
+void SobelToPlaneRow_C(const uint8_t* src_sobelx,
+                       const uint8_t* src_sobely,
+                       uint8_t* dst_y,
+                       int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
     int b = src_sobely[i];
     int s = clamp255(r + b);
-    dst_y[i] = (uint8)(s);
+    dst_y[i] = (uint8_t)(s);
   }
 }
 
-void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
-                  uint8* dst_argb, int width) {
+void SobelXYRow_C(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width) {
   int i;
   for (i = 0; i < width; ++i) {
     int r = src_sobelx[i];
     int b = src_sobely[i];
     int g = clamp255(r + b);
-    dst_argb[0] = (uint8)(b);
-    dst_argb[1] = (uint8)(g);
-    dst_argb[2] = (uint8)(r);
-    dst_argb[3] = (uint8)(255u);
+    dst_argb[0] = (uint8_t)(b);
+    dst_argb[1] = (uint8_t)(g);
+    dst_argb[2] = (uint8_t)(r);
+    dst_argb[3] = (uint8_t)(255u);
     dst_argb += 4;
   }
 }
 
-void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   // Copy a Y to RGB.
   int x;
   for (x = 0; x < width; ++x) {
-    uint8 y = src_y[0];
+    uint8_t y = src_y[0];
     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = 255u;
     dst_argb += 4;
@@ -974,75 +1067,69 @@
 //  B = (Y - 16) * 1.164 - U * -2.018
 
 // Y contribution to R,G,B.  Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
 // U and V contributions to R,G,B.
 #define UB -128 /* max(-128, round(-2.018 * 64)) */
-#define UG 25 /* round(0.391 * 64) */
-#define VG 52 /* round(0.813 * 64) */
+#define UG 25   /* round(0.391 * 64) */
+#define VG 52   /* round(0.813 * 64) */
 #define VR -102 /* round(-1.596 * 64) */
 
 // Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128            + YGB)
+#define BB (UB * 128 + YGB)
 #define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
 
 #if defined(__aarch64__)  // 64 bit arm
 const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #elif defined(__arm__)  // 32 bit arm
 const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #else
 const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 #endif
 
 #undef BB
@@ -1062,74 +1149,68 @@
 
 // Y contribution to R,G,B.  Scale and bias.
 #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32  /* 64 / 2 */
+#define YGB 32   /* 64 / 2 */
 
 // U and V contributions to R,G,B.
 #define UB -113 /* round(-1.77200 * 64) */
-#define UG 22 /* round(0.34414 * 64) */
-#define VG 46 /* round(0.71414  * 64) */
-#define VR -90 /* round(-1.40200 * 64) */
+#define UG 22   /* round(0.34414 * 64) */
+#define VG 46   /* round(0.71414  * 64) */
+#define VR -90  /* round(-1.40200 * 64) */
 
 // Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128            + YGB)
+#define BB (UB * 128 + YGB)
 #define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
 
 #if defined(__aarch64__)
 const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #elif defined(__arm__)
 const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #else
 const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 #endif
 
 #undef BB
@@ -1143,81 +1224,76 @@
 #undef YG
 
 // BT.709 YUV to RGB reference
-// *  R = Y                - V * -1.28033
-// *  G = Y - U *  0.21482 - V *  0.38059
-// *  B = Y - U * -2.12798
+//  R = (Y - 16) * 1.164              - V * -1.793
+//  G = (Y - 16) * 1.164 - U *  0.213 - V *  0.533
+//  B = (Y - 16) * 1.164 - U * -2.112
+// See also http://www.equasys.de/colorconversion.html
 
 // Y contribution to R,G,B.  Scale and bias.
-#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
-#define YGB 32  /* 64 / 2 */
+#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
-// TODO(fbarchard): Find way to express 2.12 instead of 2.0.
+// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
 // U and V contributions to R,G,B.
-#define UB -128 /* max(-128, round(-2.12798 * 64)) */
-#define UG 14 /* round(0.21482 * 64) */
-#define VG 24 /* round(0.38059  * 64) */
-#define VR -82 /* round(-1.28033 * 64) */
+#define UB -128 /* max(-128, round(-2.112 * 64)) */
+#define UG 14   /* round(0.213 * 64) */
+#define VG 34   /* round(0.533  * 64) */
+#define VR -115 /* round(-1.793 * 64) */
 
 // Bias values to round, and subtract 128 from U and V.
-#define BB (UB * 128            + YGB)
+#define BB (UB * 128 + YGB)
 #define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
 
 #if defined(__aarch64__)
 const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { UG, VG, UG, VG, UG, VG, UG, VG },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {UG, VG, UG, VG, UG, VG, UG, VG},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { VG, UG, VG, UG, VG, UG, VG, UG },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {VG, UG, VG, UG, VG, UG, VG, UG},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #elif defined(__arm__)
 const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BB, BG, BR, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0},
+    {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BB, BG, BR, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 },
-  { BR, BG, BB, 0, 0, 0, 0, 0 },
-  { 0x0101 * YG, 0, 0, 0 }
-};
+    {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0},
+    {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0},
+    {BR, BG, BB, 0, 0, 0, 0, 0},
+    {0x0101 * YG, 0, 0, 0}};
 #else
 const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = {
-  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
-    UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 },
-  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
-    UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
-  { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
-    0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
+     UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0},
+    {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+     UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG},
+    {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR,
+     0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
-  { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
-    VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 },
-  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-    VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
-  { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
-    0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB },
-  { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR },
-  { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG },
-  { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB },
-  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
-};
+    {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0,
+     VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0},
+    {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+     VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG},
+    {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB,
+     0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB},
+    {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR},
+    {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG},
+    {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB},
+    {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}};
 #endif
 
 #undef BB
@@ -1231,8 +1307,14 @@
 #undef YG
 
 // C reference code that mimics the YUV assembly.
-static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
-                              uint8* b, uint8* g, uint8* r,
+// Reads 8 bit YUV and leaves result as 8 bit.
+
+static __inline void YuvPixel(uint8_t y,
+                              uint8_t u,
+                              uint8_t v,
+                              uint8_t* b,
+                              uint8_t* g,
+                              uint8_t* r,
                               const struct YuvConstants* yuvconstants) {
 #if defined(__aarch64__)
   int ub = -yuvconstants->kUVToRB[0];
@@ -1263,22 +1345,129 @@
   int yg = yuvconstants->kYToRgb[0];
 #endif
 
-  uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
-  *b = Clamp((int32)(-(u * ub)          + y1 + bb) >> 6);
-  *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
-  *r = Clamp((int32)         (-(v * vr) + y1 + br) >> 6);
+  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+  *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6);
+  *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6);
+  *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6);
 }
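
A worked check of the fixed-point path above, assuming the x86 constant layout and the BT.601 values defined earlier (UB = -128, UG = 25, VG = 52, VR = -102, YG = 18997, YGB = -1160): video white Y=235, U=V=128 should decode to RGB (255, 255, 255).

#include <stdint.h>
#include <stdio.h>

int main(void) {
  int y = 235, u = 128, v = 128;
  int yg = 18997, ygb = -1160;
  int ub = -128, ug = 25, vg = 52, vr = -102;
  int bb = ub * 128 + ygb;
  int bg = ug * 128 + vg * 128 + ygb;
  int br = vr * 128 + ygb;
  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
  printf("%d %d %d\n",
         (-(u * ub) + (int)y1 + bb) >> 6,            // B
         (-(u * ug + v * vg) + (int)y1 + bg) >> 6,   // G
         (-(v * vr) + (int)y1 + br) >> 6);           // R: prints 255 255 255
  return 0;
}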
 
+// Reads 8 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel8_16(uint8_t y,
+                                  uint8_t u,
+                                  uint8_t v,
+                                  int* b,
+                                  int* g,
+                                  int* r,
+                                  const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = -yuvconstants->kUVToRB[1];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[4];
+  int vr = -yuvconstants->kUVToRB[4];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+  int ub = yuvconstants->kUVToB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = yuvconstants->kUVToR[1];
+  int bb = yuvconstants->kUVBiasB[0];
+  int bg = yuvconstants->kUVBiasG[0];
+  int br = yuvconstants->kUVBiasR[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
+
+  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
+  *b = (int)(-(u * ub) + y1 + bb);
+  *g = (int)(-(u * ug + v * vg) + y1 + bg);
+  *r = (int)(-(v * vr) + y1 + br);
+}
+
+// C reference code that mimics the YUV 16 bit assembly.
+// Reads 10 bit YUV and leaves result as 16 bit.
+static __inline void YuvPixel16(int16_t y,
+                                int16_t u,
+                                int16_t v,
+                                int* b,
+                                int* g,
+                                int* r,
+                                const struct YuvConstants* yuvconstants) {
+#if defined(__aarch64__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = -yuvconstants->kUVToRB[1];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#elif defined(__arm__)
+  int ub = -yuvconstants->kUVToRB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[4];
+  int vr = -yuvconstants->kUVToRB[4];
+  int bb = yuvconstants->kUVBiasBGR[0];
+  int bg = yuvconstants->kUVBiasBGR[1];
+  int br = yuvconstants->kUVBiasBGR[2];
+  int yg = yuvconstants->kYToRgb[0] / 0x0101;
+#else
+  int ub = yuvconstants->kUVToB[0];
+  int ug = yuvconstants->kUVToG[0];
+  int vg = yuvconstants->kUVToG[1];
+  int vr = yuvconstants->kUVToR[1];
+  int bb = yuvconstants->kUVBiasB[0];
+  int bg = yuvconstants->kUVBiasG[0];
+  int br = yuvconstants->kUVBiasR[0];
+  int yg = yuvconstants->kYToRgb[0];
+#endif
+
+  uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16;
+  u = clamp255(u >> 2);
+  v = clamp255(v >> 2);
+  *b = (int)(-(u * ub) + y1 + bb);
+  *g = (int)(-(u * ug + v * vg) + y1 + bg);
+  *r = (int)(-(v * vr) + y1 + br);
+}
+
+// C reference code that mimics the YUV 10 bit assembly.
+// Reads 10 bit YUV and clamps down to 8 bit RGB.
+static __inline void YuvPixel10(uint16_t y,
+                                uint16_t u,
+                                uint16_t v,
+                                uint8_t* b,
+                                uint8_t* g,
+                                uint8_t* r,
+                                const struct YuvConstants* yuvconstants) {
+  int b16;
+  int g16;
+  int r16;
+  YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants);
+  *b = Clamp(b16 >> 6);
+  *g = Clamp(g16 >> 6);
+  *r = Clamp(r16 >> 6);
+}
+
 // Y contribution to R,G,B.  Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YG 18997  /* round(1.164 * 64 * 256 * 256 / 257) */
 #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
 
 // C reference code that mimics the YUV assembly.
-static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
-  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
-  *b = Clamp((int32)(y1 + YGB) >> 6);
-  *g = Clamp((int32)(y1 + YGB) >> 6);
-  *r = Clamp((int32)(y1 + YGB) >> 6);
+static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) {
+  uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16;
+  *b = Clamp((int32_t)(y1 + YGB) >> 6);
+  *g = Clamp((int32_t)(y1 + YGB) >> 6);
+  *r = Clamp((int32_t)(y1 + YGB) >> 6);
 }
 
 #undef YG
@@ -1288,16 +1477,16 @@
     (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
 // C mimics assembly.
 // TODO(fbarchard): Remove subsampling from Neon.
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I444ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint8 u = (src_u[0] + src_u[1] + 1) >> 1;
-    uint8 v = (src_v[0] + src_v[1] + 1) >> 1;
+    uint8_t u = (src_u[0] + src_u[1] + 1) >> 1;
+    uint8_t v = (src_v[0] + src_v[1] + 1) >> 1;
     YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2,
              yuvconstants);
     rgb_buf[3] = 255;
@@ -1310,22 +1499,22 @@
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 #else
-void I444ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I444ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width; ++x) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
     src_y += 1;
     src_u += 1;
@@ -1336,19 +1525,19 @@
 #endif
 
 // Also used for 420
-void I422ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I422ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
     src_u += 1;
@@ -1356,26 +1545,120 @@
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void I422AlphaToARGBRow_C(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          const uint8* src_a,
-                          uint8* rgb_buf,
+// 10 bit YUV to ARGB
+void I210ToARGBRow_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+    YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+               rgb_buf + 6, yuvconstants);
+    rgb_buf[7] = 255;
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+               rgb_buf + 2, yuvconstants);
+    rgb_buf[3] = 255;
+  }
+}
+
+static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) {
+  uint32_t ar30;
+  b = b >> 4;  // convert 8.6 fixed point to 10 bit.
+  g = g >> 4;
+  r = r >> 4;
+  b = Clamp10(b);
+  g = Clamp10(g);
+  r = Clamp10(r);
+  ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000;
+  (*(uint32_t*)rgb_buf) = ar30;
+}
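
StoreAR30 above packs 2:10:10:10 little-endian: 10-bit blue in the low bits, then green, then red, with the constant 0xc0000000 forcing the 2-bit alpha field to opaque. A minimal packing sketch with already-clamped channels:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint32_t b = 1023, g = 512, r = 0;  // 10-bit channels
  uint32_t ar30 = b | (g << 10) | (r << 20) | 0xc0000000u;
  printf("0x%08x\n", ar30);  // 0xc00803ff
  return 0;
}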
+
+// 10 bit YUV to 10 bit AR30
+void I210ToAR30Row_C(const uint16_t* src_y,
+                     const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int b;
+  int g;
+  int r;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+    YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf + 4, b, g, r);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+  }
+}
+
+// 8 bit YUV to 10 bit AR30
+// Uses the same code as 10 bit YUV, but bit-shifts the 8 bit values up to 10 bits.
+void I422ToAR30Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
+                     const struct YuvConstants* yuvconstants,
+                     int width) {
+  int x;
+  int b;
+  int g;
+  int r;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+    YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf + 4, b, g, r);
+    src_y += 2;
+    src_u += 1;
+    src_v += 1;
+    rgb_buf += 8;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
+    StoreAR30(rgb_buf, b, g, r);
+  }
+}
+
+void I422AlphaToARGBRow_C(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          const uint8_t* src_a,
+                          uint8_t* rgb_buf,
                           const struct YuvConstants* yuvconstants,
                           int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = src_a[0];
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = src_a[1];
     src_y += 2;
     src_u += 1;
@@ -1384,24 +1667,24 @@
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = src_a[0];
   }
 }
 
-void I422ToRGB24Row_C(const uint8* src_y,
-                      const uint8* src_u,
-                      const uint8* src_v,
-                      uint8* rgb_buf,
+void I422ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_u,
+                      const uint8_t* src_v,
+                      uint8_t* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4,
+             rgb_buf + 5, yuvconstants);
     src_y += 2;
     src_u += 1;
     src_v += 1;
@@ -1408,23 +1691,23 @@
     rgb_buf += 6;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
   }
 }
 
-void I422ToARGB4444Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb4444,
+void I422ToARGB4444Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb4444,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
@@ -1435,8 +1718,8 @@
     b1 = b1 >> 4;
     g1 = g1 >> 4;
     r1 = r1 >> 4;
-    *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
-        (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000;
+    *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) |
+                                 (g1 << 20) | (r1 << 24) | 0xf000f000;
     src_y += 2;
     src_u += 1;
     src_v += 1;
@@ -1447,23 +1730,22 @@
     b0 = b0 >> 4;
     g0 = g0 >> 4;
     r0 = r0 >> 4;
-    *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) |
-        0xf000;
+    *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000;
   }
 }
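
The two-pixel store above packs a pair of ARGB4444 pixels into one 32-bit word, with 0xf000f000 setting both alpha nibbles opaque. A standalone sketch:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint32_t b0 = 0xF, g0 = 0x0, r0 = 0xF;  // pixel 0: magenta
  uint32_t b1 = 0x0, g1 = 0xF, r1 = 0x0;  // pixel 1: green
  uint32_t two = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | (g1 << 20) |
                 (r1 << 24) | 0xf000f000u;
  printf("0x%08x\n", two);  // 0xf0f0ff0f
  return 0;
}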
 
-void I422ToARGB1555Row_C(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_argb1555,
+void I422ToARGB1555Row_C(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_argb1555,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
@@ -1474,8 +1756,8 @@
     b1 = b1 >> 3;
     g1 = g1 >> 3;
     r1 = r1 >> 3;
-    *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
-        (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000;
+    *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) |
+                                 (g1 << 21) | (r1 << 26) | 0x80008000;
     src_y += 2;
     src_u += 1;
     src_v += 1;
@@ -1486,23 +1768,22 @@
     b0 = b0 >> 3;
     g0 = g0 >> 3;
     r0 = r0 >> 3;
-    *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) |
-        0x8000;
+    *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000;
   }
 }
 
-void I422ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_u,
-                       const uint8* src_v,
-                       uint8* dst_rgb565,
+void I422ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_rgb565,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
     YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants);
@@ -1513,8 +1794,8 @@
     b1 = b1 >> 3;
     g1 = g1 >> 2;
     r1 = r1 >> 3;
-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    *(uint32_t*)(dst_rgb565) =
+        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
     src_y += 2;
     src_u += 1;
     src_v += 1;
@@ -1525,111 +1806,111 @@
     b0 = b0 >> 3;
     g0 = g0 >> 2;
     r0 = r0 >> 3;
-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+    *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
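
The packed-RGB rows above all follow the same pattern: truncate each 8-bit channel to the target depth, then OR the fields together, storing two pixels per 32-bit write. A minimal standalone sketch of the RGB565 case for a single pixel (the helper name PackRGB565 is hypothetical, not part of libyuv):

  #include <stdint.h>
  // Pack one 8-bit-per-channel pixel as RGB565: 5 bits blue, 6 green, 5 red,
  // mirroring the b >> 3, g >> 2, r >> 3 truncation in I422ToRGB565Row_C.
  static uint16_t PackRGB565(uint8_t b, uint8_t g, uint8_t r) {
    return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
  }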
 
-void I411ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void NV12ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_uv,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
-  for (x = 0; x < width - 3; x += 4) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
-    YuvPixel(src_y[2], src_u[0], src_v[0],
-             rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants);
-    rgb_buf[11] = 255;
-    YuvPixel(src_y[3], src_u[0], src_v[0],
-             rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants);
-    rgb_buf[15] = 255;
-    src_y += 4;
-    src_u += 1;
-    src_v += 1;
-    rgb_buf += 16;  // Advance 4 pixels.
-  }
-  if (width & 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
     src_y += 2;
+    src_uv += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void NV12ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_uv,
-                     uint8* rgb_buf,
+void NV21ToARGBRow_C(const uint8_t* src_y,
+                     const uint8_t* src_vu,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_uv[0], src_uv[1],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_y += 2;
-    src_uv += 2;
+    src_vu += 2;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_uv[0], src_uv[1],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
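
NV12 and NV21 share the Y plane and differ only in the byte order of the interleaved chroma plane, which is why the two rows above are identical except for the swapped subscripts (src_uv[0]/src_uv[1] versus src_vu[1]/src_vu[0]). A sketch that converts one chroma row between the two orders (SwapVUToUV is a hypothetical helper, shown only to illustrate the layout):

  #include <stdint.h>
  // NV12 chroma row: U0 V0 U1 V1 ...   NV21 chroma row: V0 U0 V1 U1 ...
  static void SwapVUToUV(uint8_t* uv, int pairs) {
    int i;
    for (i = 0; i < pairs; ++i) {
      uint8_t t = uv[0];
      uv[0] = uv[1];
      uv[1] = t;
      uv += 2;
    }
  }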
 
-void NV21ToARGBRow_C(const uint8* src_y,
-                     const uint8* src_vu,
-                     uint8* rgb_buf,
-                     const struct YuvConstants* yuvconstants,
-                     int width) {
+void NV12ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_uv,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
-    YuvPixel(src_y[1], src_vu[1], src_vu[0],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
-    rgb_buf[7] = 255;
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4,
+             rgb_buf + 5, yuvconstants);
     src_y += 2;
+    src_uv += 2;
+    rgb_buf += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+  }
+}
+
+void NV21ToRGB24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* rgb_buf,
+                      const struct YuvConstants* yuvconstants,
+                      int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
+    YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4,
+             rgb_buf + 5, yuvconstants);
+    src_y += 2;
     src_vu += 2;
-    rgb_buf += 8;  // Advance 2 pixels.
+    rgb_buf += 6;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_vu[1], src_vu[0],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
-    rgb_buf[3] = 255;
+    YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
   }
 }
 
-void NV12ToRGB565Row_C(const uint8* src_y,
-                       const uint8* src_uv,
-                       uint8* dst_rgb565,
+void NV12ToRGB565Row_C(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_rgb565,
                        const struct YuvConstants* yuvconstants,
                        int width) {
-  uint8 b0;
-  uint8 g0;
-  uint8 r0;
-  uint8 b1;
-  uint8 g1;
-  uint8 r1;
+  uint8_t b0;
+  uint8_t g0;
+  uint8_t r0;
+  uint8_t b1;
+  uint8_t g1;
+  uint8_t r1;
   int x;
   for (x = 0; x < width - 1; x += 2) {
     YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants);
@@ -1640,8 +1921,8 @@
     b1 = b1 >> 3;
     g1 = g1 >> 2;
     r1 = r1 >> 3;
-    *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) |
-        (b1 << 16) | (g1 << 21) | (r1 << 27);
+    *(uint32_t*)(dst_rgb565) =
+        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27);
     src_y += 2;
     src_uv += 2;
     dst_rgb565 += 4;  // Advance 2 pixels.
@@ -1651,67 +1932,67 @@
     b0 = b0 >> 3;
     g0 = g0 >> 2;
     r0 = r0 >> 3;
-    *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
+    *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11);
   }
 }
 
-void YUY2ToARGBRow_C(const uint8* src_yuy2,
-                     uint8* rgb_buf,
+void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_yuy2 += 4;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void UYVYToARGBRow_C(const uint8* src_uyvy,
-                     uint8* rgb_buf,
+void UYVYToARGBRow_C(const uint8_t* src_uyvy,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
-    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants);
+    YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5,
+             rgb_buf + 6, yuvconstants);
     rgb_buf[7] = 255;
     src_uyvy += 4;
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2],
-             rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants);
+    YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1,
+             rgb_buf + 2, yuvconstants);
     rgb_buf[3] = 255;
   }
 }
 
-void I422ToRGBARow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* rgb_buf,
+void I422ToRGBARow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
                      int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+             rgb_buf + 3, yuvconstants);
     rgb_buf[0] = 255;
-    YuvPixel(src_y[1], src_u[0], src_v[0],
-             rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants);
+    YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6,
+             rgb_buf + 7, yuvconstants);
     rgb_buf[4] = 255;
     src_y += 2;
     src_u += 1;
@@ -1719,13 +2000,13 @@
     rgb_buf += 8;  // Advance 2 pixels.
   }
   if (width & 1) {
-    YuvPixel(src_y[0], src_u[0], src_v[0],
-             rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants);
+    YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2,
+             rgb_buf + 3, yuvconstants);
     rgb_buf[0] = 255;
   }
 }
 
-void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) {
+void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2);
@@ -1741,7 +2022,7 @@
   }
 }
 
-void MirrorRow_C(const uint8* src, uint8* dst, int width) {
+void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
   int x;
   src += width - 1;
   for (x = 0; x < width - 1; x += 2) {
@@ -1754,7 +2035,10 @@
   }
 }
 
-void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void MirrorUVRow_C(const uint8_t* src_uv,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width) {
   int x;
   src_uv += (width - 1) << 1;
   for (x = 0; x < width - 1; x += 2) {
@@ -1770,10 +2054,10 @@
   }
 }
 
-void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
   int x;
-  const uint32* src32 = (const uint32*)(src);
-  uint32* dst32 = (uint32*)(dst);
+  const uint32_t* src32 = (const uint32_t*)(src);
+  uint32_t* dst32 = (uint32_t*)(dst);
   src32 += width - 1;
   for (x = 0; x < width - 1; x += 2) {
     dst32[x] = src32[0];
@@ -1785,7 +2069,10 @@
   }
 }
 
-void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+void SplitUVRow_C(const uint8_t* src_uv,
+                  uint8_t* dst_u,
+                  uint8_t* dst_v,
+                  int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst_u[x] = src_uv[0];
@@ -1800,7 +2087,9 @@
   }
 }
 
-void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_C(const uint8_t* src_u,
+                  const uint8_t* src_v,
+                  uint8_t* dst_uv,
                   int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -1816,20 +2105,110 @@
   }
 }
 
-void CopyRow_C(const uint8* src, uint8* dst, int count) {
+void SplitRGBRow_C(const uint8_t* src_rgb,
+                   uint8_t* dst_r,
+                   uint8_t* dst_g,
+                   uint8_t* dst_b,
+                   int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_r[x] = src_rgb[0];
+    dst_g[x] = src_rgb[1];
+    dst_b[x] = src_rgb[2];
+    src_rgb += 3;
+  }
+}
+
+void MergeRGBRow_C(const uint8_t* src_r,
+                   const uint8_t* src_g,
+                   const uint8_t* src_b,
+                   uint8_t* dst_rgb,
+                   int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_rgb[0] = src_r[x];
+    dst_rgb[1] = src_g[x];
+    dst_rgb[2] = src_b[x];
+    dst_rgb += 3;
+  }
+}
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+void MergeUVRow_16_C(const uint16_t* src_u,
+                     const uint16_t* src_v,
+                     uint16_t* dst_uv,
+                     int scale,
+                     int width) {
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = src_u[x] * scale;
+    dst_uv[1] = src_v[x] * scale;
+    dst_uv[2] = src_u[x + 1] * scale;
+    dst_uv[3] = src_v[x + 1] * scale;
+    dst_uv += 4;
+  }
+  if (width & 1) {
+    dst_uv[0] = src_u[width - 1] * scale;
+    dst_uv[1] = src_v[width - 1] * scale;
+  }
+}
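
The scale table in the comment above is just 1 << (16 - bits): multiplying left-justifies the sample's significant bits into a 16-bit word. A hedged one-liner capturing that relationship (MsbScale is a hypothetical name):

  // Scale factor that left-justifies 'bits' significant bits in 16 bits,
  // matching the table above: 9 -> 128, 10 -> 64, 12 -> 16, 16 -> 1.
  static int MsbScale(int bits) {
    return 1 << (16 - bits);
  }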
+
+void MultiplyRow_16_C(const uint16_t* src_y,
+                      uint16_t* dst_y,
+                      int scale,
+                      int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = src_y[x] * scale;
+  }
+}
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_C(const uint16_t* src_y,
+                       uint8_t* dst_y,
+                       int scale,
+                       int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = clamp255((src_y[x] * scale) >> 16);
+  }
+}
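
Here the constants are 1 << (24 - bits), so (v * scale) >> 16 keeps the top 8 of the sample's significant bits; for example, the 10-bit maximum 1023 times 16384, shifted right 16, yields exactly 255. A sketch of that relationship (NarrowScale is a hypothetical name):

  // Scale for Convert16To8Row_C: 9 -> 32768, 10 -> 16384, 12 -> 4096,
  // 16 -> 256; e.g. (1023 * 16384) >> 16 == 255 for 10-bit input.
  static int NarrowScale(int bits) {
    return 1 << (24 - bits);
  }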
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 1024 = 10 bits
+void Convert8To16Row_C(const uint8_t* src_y,
+                       uint16_t* dst_y,
+                       int scale,
+                       int width) {
+  int x;
+  scale *= 0x0101;  // replicates the byte.
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = (src_y[x] * scale) >> 16;
+  }
+}
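
Multiplying the scale by 0x0101 replicates the source byte (v * 0x0101 == (v << 8) | v), so after the >> 16 the 8-bit maximum lands exactly on the wider maximum instead of falling short. A quick standalone sanity check for the 10-bit case named in the comment:

  #include <assert.h>
  static void CheckConvert8To16(void) {
    int scale = 1024 * 0x0101;              // 10-bit case from the comment
    assert(((255 * scale) >> 16) == 1023);  // 8-bit max -> 10-bit max
    assert(((0 * scale) >> 16) == 0);       // zero stays zero
  }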
+
+void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
   memcpy(dst, src, count);
 }
 
-void CopyRow_16_C(const uint16* src, uint16* dst, int count) {
+void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) {
   memcpy(dst, src, count * 2);
 }
 
-void SetRow_C(uint8* dst, uint8 v8, int width) {
+void SetRow_C(uint8_t* dst, uint8_t v8, int width) {
   memset(dst, v8, width);
 }
 
-void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) {
-  uint32* d = (uint32*)(dst_argb);
+void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) {
+  uint32_t* d = (uint32_t*)(dst_argb);
   int x;
   for (x = 0; x < width; ++x) {
     d[x] = v32;
@@ -1837,8 +2216,11 @@
 }
 
 // Filter 2 rows of YUY2 UV values (422) into U and V (420).
-void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUVRow_C(const uint8_t* src_yuy2,
+                   int src_stride_yuy2,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width) {
   // Output a row of UV values, filtering 2 rows of YUY2.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1851,8 +2233,10 @@
 }
 
 // Copy row of YUY2 UV values (422) into U and V (422).
-void YUY2ToUV422Row_C(const uint8* src_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1865,7 +2249,7 @@
 }
 
 // Copy row of YUY2 Y values (422) into Y (420/422).
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
+void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
   // Output a row of Y values.
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -1879,8 +2263,11 @@
 }
 
 // Filter 2 rows of UYVY UV values (422) into U and V (420).
-void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUVRow_C(const uint8_t* src_uyvy,
+                   int src_stride_uyvy,
+                   uint8_t* dst_u,
+                   uint8_t* dst_v,
+                   int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1893,8 +2280,10 @@
 }
 
 // Copy row of UYVY UV values (422) into U and V (422).
-void UYVYToUV422Row_C(const uint8* src_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void UYVYToUV422Row_C(const uint8_t* src_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   // Output a row of UV values.
   int x;
   for (x = 0; x < width; x += 2) {
@@ -1907,7 +2296,7 @@
 }
 
 // Copy row of UYVY Y values (422) into Y (420/422).
-void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
   // Output a row of Y values.
   int x;
   for (x = 0; x < width - 1; x += 2) {
@@ -1925,17 +2314,19 @@
 // Blend src_argb0 over src_argb1 and store to dst_argb.
 // dst_argb may be src_argb0 or src_argb1.
 // This code mimics the SSSE3 version for better testability.
-void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
-                    uint8* dst_argb, int width) {
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
-    uint32 fb = src_argb0[0];
-    uint32 fg = src_argb0[1];
-    uint32 fr = src_argb0[2];
-    uint32 a = src_argb0[3];
-    uint32 bb = src_argb1[0];
-    uint32 bg = src_argb1[1];
-    uint32 br = src_argb1[2];
+    uint32_t fb = src_argb0[0];
+    uint32_t fg = src_argb0[1];
+    uint32_t fr = src_argb0[2];
+    uint32_t a = src_argb0[3];
+    uint32_t bb = src_argb1[0];
+    uint32_t bg = src_argb1[1];
+    uint32_t br = src_argb1[2];
     dst_argb[0] = BLEND(fb, bb, a);
     dst_argb[1] = BLEND(fg, bg, a);
     dst_argb[2] = BLEND(fr, br, a);
@@ -1958,13 +2349,13 @@
   }
 
   if (width & 1) {
-    uint32 fb = src_argb0[0];
-    uint32 fg = src_argb0[1];
-    uint32 fr = src_argb0[2];
-    uint32 a = src_argb0[3];
-    uint32 bb = src_argb1[0];
-    uint32 bg = src_argb1[1];
-    uint32 br = src_argb1[2];
+    uint32_t fb = src_argb0[0];
+    uint32_t fg = src_argb0[1];
+    uint32_t fr = src_argb0[2];
+    uint32_t a = src_argb0[3];
+    uint32_t bb = src_argb1[0];
+    uint32_t bg = src_argb1[1];
+    uint32_t br = src_argb1[2];
     dst_argb[0] = BLEND(fb, bb, a);
     dst_argb[1] = BLEND(fg, bg, a);
     dst_argb[2] = BLEND(fr, br, a);
@@ -1973,9 +2364,12 @@
 }
 #undef BLEND
 
-#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
-                     const uint8* alpha, uint8* dst, int width) {
+#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8
+void BlendPlaneRow_C(const uint8_t* src0,
+                     const uint8_t* src1,
+                     const uint8_t* alpha,
+                     uint8_t* dst,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst[0] = UBLEND(src0[0], src1[0], alpha[0]);
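
UBLEND blends unattenuated (non-premultiplied) planes as (a*f + (255 - a)*b + 255) >> 8; the +255 bias makes both endpoints exact, since a*f + (255 - a)*b + 255 equals 256*f + (255 - f) when a is 255. A small check of those endpoints, assuming the UBLEND macro defined above is in scope:

  #include <assert.h>
  static void CheckUBlendEndpoints(void) {
    int f;
    for (f = 0; f < 256; ++f) {
      assert((UBLEND(f, 0, 255)) == f);  // opaque alpha returns foreground
      assert((UBLEND(0, f, 0)) == f);    // zero alpha returns background
    }
  }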
@@ -1995,13 +2389,13 @@
 
 // Multiply source RGB by alpha and store to destination.
 // This code mimics the SSSE3 version for better testability.
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
   int i;
   for (i = 0; i < width - 1; i += 2) {
-    uint32 b = src_argb[0];
-    uint32 g = src_argb[1];
-    uint32 r = src_argb[2];
-    uint32 a = src_argb[3];
+    uint32_t b = src_argb[0];
+    uint32_t g = src_argb[1];
+    uint32_t r = src_argb[2];
+    uint32_t a = src_argb[3];
     dst_argb[0] = ATTENUATE(b, a);
     dst_argb[1] = ATTENUATE(g, a);
     dst_argb[2] = ATTENUATE(r, a);
@@ -2019,10 +2413,10 @@
   }
 
   if (width & 1) {
-    const uint32 b = src_argb[0];
-    const uint32 g = src_argb[1];
-    const uint32 r = src_argb[2];
-    const uint32 a = src_argb[3];
+    const uint32_t b = src_argb[0];
+    const uint32_t g = src_argb[1];
+    const uint32_t r = src_argb[2];
+    const uint32_t a = src_argb[3];
     dst_argb[0] = ATTENUATE(b, a);
     dst_argb[1] = ATTENUATE(g, a);
     dst_argb[2] = ATTENUATE(r, a);
@@ -2038,49 +2432,56 @@
 // Reciprocal method is off by 1 on some values, e.g. 125.
 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower.
 #define T(a) 0x01000000 + (0x10000 / a)
-const uint32 fixed_invtbl8[256] = {
-  0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
-  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
-  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
-  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
-  T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27),
-  T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f),
-  T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37),
-  T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f),
-  T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47),
-  T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f),
-  T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57),
-  T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f),
-  T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67),
-  T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f),
-  T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77),
-  T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f),
-  T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87),
-  T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f),
-  T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97),
-  T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f),
-  T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7),
-  T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf),
-  T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7),
-  T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf),
-  T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7),
-  T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf),
-  T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7),
-  T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf),
-  T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7),
-  T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef),
-  T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7),
-  T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 };
+const uint32_t fixed_invtbl8[256] = {
+    0x01000000, 0x0100ffff, T(0x02), T(0x03),   T(0x04), T(0x05), T(0x06),
+    T(0x07),    T(0x08),    T(0x09), T(0x0a),   T(0x0b), T(0x0c), T(0x0d),
+    T(0x0e),    T(0x0f),    T(0x10), T(0x11),   T(0x12), T(0x13), T(0x14),
+    T(0x15),    T(0x16),    T(0x17), T(0x18),   T(0x19), T(0x1a), T(0x1b),
+    T(0x1c),    T(0x1d),    T(0x1e), T(0x1f),   T(0x20), T(0x21), T(0x22),
+    T(0x23),    T(0x24),    T(0x25), T(0x26),   T(0x27), T(0x28), T(0x29),
+    T(0x2a),    T(0x2b),    T(0x2c), T(0x2d),   T(0x2e), T(0x2f), T(0x30),
+    T(0x31),    T(0x32),    T(0x33), T(0x34),   T(0x35), T(0x36), T(0x37),
+    T(0x38),    T(0x39),    T(0x3a), T(0x3b),   T(0x3c), T(0x3d), T(0x3e),
+    T(0x3f),    T(0x40),    T(0x41), T(0x42),   T(0x43), T(0x44), T(0x45),
+    T(0x46),    T(0x47),    T(0x48), T(0x49),   T(0x4a), T(0x4b), T(0x4c),
+    T(0x4d),    T(0x4e),    T(0x4f), T(0x50),   T(0x51), T(0x52), T(0x53),
+    T(0x54),    T(0x55),    T(0x56), T(0x57),   T(0x58), T(0x59), T(0x5a),
+    T(0x5b),    T(0x5c),    T(0x5d), T(0x5e),   T(0x5f), T(0x60), T(0x61),
+    T(0x62),    T(0x63),    T(0x64), T(0x65),   T(0x66), T(0x67), T(0x68),
+    T(0x69),    T(0x6a),    T(0x6b), T(0x6c),   T(0x6d), T(0x6e), T(0x6f),
+    T(0x70),    T(0x71),    T(0x72), T(0x73),   T(0x74), T(0x75), T(0x76),
+    T(0x77),    T(0x78),    T(0x79), T(0x7a),   T(0x7b), T(0x7c), T(0x7d),
+    T(0x7e),    T(0x7f),    T(0x80), T(0x81),   T(0x82), T(0x83), T(0x84),
+    T(0x85),    T(0x86),    T(0x87), T(0x88),   T(0x89), T(0x8a), T(0x8b),
+    T(0x8c),    T(0x8d),    T(0x8e), T(0x8f),   T(0x90), T(0x91), T(0x92),
+    T(0x93),    T(0x94),    T(0x95), T(0x96),   T(0x97), T(0x98), T(0x99),
+    T(0x9a),    T(0x9b),    T(0x9c), T(0x9d),   T(0x9e), T(0x9f), T(0xa0),
+    T(0xa1),    T(0xa2),    T(0xa3), T(0xa4),   T(0xa5), T(0xa6), T(0xa7),
+    T(0xa8),    T(0xa9),    T(0xaa), T(0xab),   T(0xac), T(0xad), T(0xae),
+    T(0xaf),    T(0xb0),    T(0xb1), T(0xb2),   T(0xb3), T(0xb4), T(0xb5),
+    T(0xb6),    T(0xb7),    T(0xb8), T(0xb9),   T(0xba), T(0xbb), T(0xbc),
+    T(0xbd),    T(0xbe),    T(0xbf), T(0xc0),   T(0xc1), T(0xc2), T(0xc3),
+    T(0xc4),    T(0xc5),    T(0xc6), T(0xc7),   T(0xc8), T(0xc9), T(0xca),
+    T(0xcb),    T(0xcc),    T(0xcd), T(0xce),   T(0xcf), T(0xd0), T(0xd1),
+    T(0xd2),    T(0xd3),    T(0xd4), T(0xd5),   T(0xd6), T(0xd7), T(0xd8),
+    T(0xd9),    T(0xda),    T(0xdb), T(0xdc),   T(0xdd), T(0xde), T(0xdf),
+    T(0xe0),    T(0xe1),    T(0xe2), T(0xe3),   T(0xe4), T(0xe5), T(0xe6),
+    T(0xe7),    T(0xe8),    T(0xe9), T(0xea),   T(0xeb), T(0xec), T(0xed),
+    T(0xee),    T(0xef),    T(0xf0), T(0xf1),   T(0xf2), T(0xf3), T(0xf4),
+    T(0xf5),    T(0xf6),    T(0xf7), T(0xf8),   T(0xf9), T(0xfa), T(0xfb),
+    T(0xfc),    T(0xfd),    T(0xfe), 0x01000100};
 #undef T
 
-void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
+void ARGBUnattenuateRow_C(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width) {
   int i;
   for (i = 0; i < width; ++i) {
-    uint32 b = src_argb[0];
-    uint32 g = src_argb[1];
-    uint32 r = src_argb[2];
-    const uint32 a = src_argb[3];
-    const uint32 ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
+    uint32_t b = src_argb[0];
+    uint32_t g = src_argb[1];
+    uint32_t r = src_argb[2];
+    const uint32_t a = src_argb[3];
+    const uint32_t ia = fixed_invtbl8[a] & 0xffff;  // 8.8 fixed point
     b = (b * ia) >> 8;
     g = (g * ia) >> 8;
     r = (r * ia) >> 8;
@@ -2094,9 +2495,11 @@
   }
 }
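
Each table entry keeps 1.0 (0x0100) in the upper 16 bits and an 8.8 fixed-point reciprocal 65536/a in the lower 16, which the loop above masks out and multiplies by. A worked check for alpha 128, assuming fixed_invtbl8 above is in scope: T(0x80) is 0x01000000 + 0x10000/0x80 = 0x01000200, so the masked reciprocal is 0x200 = 512 and (v * 512) >> 8 doubles the channel, undoing attenuation by a half-opaque alpha.

  #include <assert.h>
  #include <stdint.h>
  static void CheckUnattenuateByHalf(void) {
    const uint32_t ia = fixed_invtbl8[128] & 0xffff;  // 8.8 reciprocal
    assert(ia == 512);                                // 65536 / 128
    assert(((100 * ia) >> 8) == 200);                 // channel doubled
  }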
 
-void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
-                               const int32* previous_cumsum, int width) {
-  int32 row_sum[4] = {0, 0, 0, 0};
+void ComputeCumulativeSumRow_C(const uint8_t* row,
+                               int32_t* cumsum,
+                               const int32_t* previous_cumsum,
+                               int width) {
+  int32_t row_sum[4] = {0, 0, 0, 0};
   int x;
   for (x = 0; x < width; ++x) {
     row_sum[0] += row[x * 4 + 0];
@@ -2103,22 +2506,26 @@
     row_sum[1] += row[x * 4 + 1];
     row_sum[2] += row[x * 4 + 2];
     row_sum[3] += row[x * 4 + 3];
-    cumsum[x * 4 + 0] = row_sum[0]  + previous_cumsum[x * 4 + 0];
-    cumsum[x * 4 + 1] = row_sum[1]  + previous_cumsum[x * 4 + 1];
-    cumsum[x * 4 + 2] = row_sum[2]  + previous_cumsum[x * 4 + 2];
-    cumsum[x * 4 + 3] = row_sum[3]  + previous_cumsum[x * 4 + 3];
+    cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0];
+    cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1];
+    cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2];
+    cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3];
   }
 }
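
ComputeCumulativeSumRow_C builds one row of a per-channel integral image: each entry holds the sum of all pixels above and to the left. CumulativeSumToAverageRow_C below then recovers any box sum from four corner lookups, which is the bl[w] + tl[0] - bl[0] - tl[w] expression in its body. A sketch of that identity (BoxSum is a hypothetical helper):

  #include <stdint.h>
  // tl points into the integral-image row above the box, bl into the row
  // below it; w is the box width in int32 lanes (4 per ARGB pixel).
  static int32_t BoxSum(const int32_t* tl, const int32_t* bl, int w) {
    return bl[w] + tl[0] - bl[0] - tl[w];
  }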
 
-void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
-                                int w, int area, uint8* dst, int count) {
+void CumulativeSumToAverageRow_C(const int32_t* tl,
+                                 const int32_t* bl,
+                                 int w,
+                                 int area,
+                                 uint8_t* dst,
+                                 int count) {
   float ooa = 1.0f / area;
   int i;
   for (i = 0; i < count; ++i) {
-    dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
-    dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
-    dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
-    dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
+    dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa);
+    dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa);
+    dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa);
+    dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa);
     dst += 4;
     tl += 4;
     bl += 4;
@@ -2127,8 +2534,11 @@
 
 // Copy pixels from rotated source to destination row with a slope.
 LIBYUV_API
-void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
-                     uint8* dst_argb, const float* uv_dudv, int width) {
+void ARGBAffineRow_C(const uint8_t* src_argb,
+                     int src_argb_stride,
+                     uint8_t* dst_argb,
+                     const float* uv_dudv,
+                     int width) {
   int i;
   // Render a row of pixels from source into a buffer.
   float uv[2];
@@ -2137,9 +2547,8 @@
   for (i = 0; i < width; ++i) {
     int x = (int)(uv[0]);
     int y = (int)(uv[1]);
-    *(uint32*)(dst_argb) =
-        *(const uint32*)(src_argb + y * src_argb_stride +
-                                         x * 4);
+    *(uint32_t*)(dst_argb) =
+        *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4);
     dst_argb += 4;
     uv[0] += uv_dudv[2];
     uv[1] += uv_dudv[3];
@@ -2147,8 +2556,10 @@
 }
 
 // Blend 2 rows into 1.
-static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride,
-                      uint8* dst_uv, int width) {
+static void HalfRow_C(const uint8_t* src_uv,
+                      ptrdiff_t src_uv_stride,
+                      uint8_t* dst_uv,
+                      int width) {
   int x;
   for (x = 0; x < width; ++x) {
     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
@@ -2155,8 +2566,10 @@
   }
 }
 
-static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride,
-                         uint16* dst_uv, int width) {
+static void HalfRow_16_C(const uint16_t* src_uv,
+                         ptrdiff_t src_uv_stride,
+                         uint16_t* dst_uv,
+                         int width) {
   int x;
   for (x = 0; x < width; ++x) {
     dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1;
@@ -2164,12 +2577,14 @@
 }
 
 // C version 2x2 -> 2x1.
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr,
+void InterpolateRow_C(uint8_t* dst_ptr,
+                      const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
-                      int width, int source_y_fraction) {
+                      int width,
+                      int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
   int x;
   if (y1_fraction == 0) {
     memcpy(dst_ptr, src_ptr, width);
@@ -2194,12 +2609,14 @@
   }
 }
 
-void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
+void InterpolateRow_16_C(uint16_t* dst_ptr,
+                         const uint16_t* src_ptr,
                          ptrdiff_t src_stride,
-                         int width, int source_y_fraction) {
+                         int width,
+                         int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
-  const uint16* src_ptr1 = src_ptr + src_stride;
+  const uint16_t* src_ptr1 = src_ptr + src_stride;
   int x;
   if (source_y_fraction == 0) {
     memcpy(dst_ptr, src_ptr, width * 2);
@@ -2222,8 +2639,10 @@
 }
 
 // Use first 4 shuffler values to reorder ARGB channels.
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
-                      const uint8* shuffler, int width) {
+void ARGBShuffleRow_C(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      const uint8_t* shuffler,
+                      int width) {
   int index0 = shuffler[0];
   int index1 = shuffler[1];
   int index2 = shuffler[2];
@@ -2232,10 +2651,10 @@
   int x;
   for (x = 0; x < width; ++x) {
     // To support in-place conversion.
-    uint8 b = src_argb[index0];
-    uint8 g = src_argb[index1];
-    uint8 r = src_argb[index2];
-    uint8 a = src_argb[index3];
+    uint8_t b = src_argb[index0];
+    uint8_t g = src_argb[index1];
+    uint8_t r = src_argb[index2];
+    uint8_t a = src_argb[index3];
     dst_argb[0] = b;
     dst_argb[1] = g;
     dst_argb[2] = r;
@@ -2245,10 +2664,11 @@
   }
 }
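
Only the first four shuffler bytes matter to this C version: they are source indices for the B, G, R and A outputs, read before any store so the conversion can run in place. For example, a table of {2, 1, 0, 3} swaps the blue and red channels (kShuffleBR is a hypothetical name):

  #include <stdint.h>
  static const uint8_t kShuffleBR[4] = {2u, 1u, 0u, 3u};
  // Usage sketch: ARGBShuffleRow_C(row, row, kShuffleBR, width);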
 
-void I422ToYUY2Row_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_frame, int width) {
+void I422ToYUY2Row_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst_frame[0] = src_y[0];
@@ -2268,10 +2688,11 @@
   }
 }
 
-void I422ToUYVYRow_C(const uint8* src_y,
-                     const uint8* src_u,
-                     const uint8* src_v,
-                     uint8* dst_frame, int width) {
+void I422ToUYVYRow_C(const uint8_t* src_y,
+                     const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_frame,
+                     int width) {
   int x;
   for (x = 0; x < width - 1; x += 2) {
     dst_frame[0] = src_u[0];
@@ -2291,9 +2712,8 @@
   }
 }
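
Both packed 4:2:2 formats above store two pixels in four bytes and differ only in where the luma samples sit: YUY2 is written as Y0 U Y1 V, UYVY as U Y0 V Y1. An illustrative sketch of the two macropixel layouts as the rows above produce them:

  #include <stdint.h>
  typedef struct { uint8_t y0, u, y1, v; } Yuy2Pair;  /* YUY2: Y0 U Y1 V */
  typedef struct { uint8_t u, y0, v, y1; } UyvyPair;  /* UYVY: U Y0 V Y1 */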
 
-
-void ARGBPolynomialRow_C(const uint8* src_argb,
-                         uint8* dst_argb,
+void ARGBPolynomialRow_C(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
                          const float* poly,
                          int width) {
   int i;
@@ -2323,33 +2743,75 @@
     dr += poly[14] * r3;
     da += poly[15] * a3;
 
-    dst_argb[0] = Clamp((int32)(db));
-    dst_argb[1] = Clamp((int32)(dg));
-    dst_argb[2] = Clamp((int32)(dr));
-    dst_argb[3] = Clamp((int32)(da));
+    dst_argb[0] = Clamp((int32_t)(db));
+    dst_argb[1] = Clamp((int32_t)(dg));
+    dst_argb[2] = Clamp((int32_t)(dr));
+    dst_argb[3] = Clamp((int32_t)(da));
     src_argb += 4;
     dst_argb += 4;
   }
 }
 
-void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                             const uint8* luma, uint32 lumacoeff) {
-  uint32 bc = lumacoeff & 0xff;
-  uint32 gc = (lumacoeff >> 8) & 0xff;
-  uint32 rc = (lumacoeff >> 16) & 0xff;
+// Samples assumed to be unsigned in low 9, 10 or 12 bits.  Scale factor
+// adjusts the source integer range to the desired half-float range.
 
+// This magic constant is 2^-112. Multiplying by this
+// is the same as subtracting 112 from the exponent, which
+// is the difference in exponent bias between 32-bit and
+// 16-bit floats. Once we've done this subtraction, we can
+// simply extract the low bits of the exponent and the high
+// bits of the mantissa from our float and we're done.
+
+// Work around GCC 7 punning warning -Wstrict-aliasing
+#if defined(__GNUC__)
+typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t;
+#else
+typedef uint32_t uint32_alias_t;
+#endif
+
+void HalfFloatRow_C(const uint16_t* src,
+                    uint16_t* dst,
+                    float scale,
+                    int width) {
   int i;
+  float mult = 1.9259299444e-34f * scale;
+  for (i = 0; i < width; ++i) {
+    float value = src[i] * mult;
+    dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13);
+  }
+}
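
With scale 1.0 the multiplier is exactly 2^-112, so an input of 1 becomes the float 2^-112, whose bit pattern is 0x07800000; shifting right 13 produces 0x3C00, the IEEE half-precision encoding of 1.0. A standalone check of that round trip (memcpy replaces the pointer pun for portability):

  #include <assert.h>
  #include <math.h>
  #include <stdint.h>
  #include <string.h>
  static void CheckHalfFloatOne(void) {
    float value = 1.0f * ldexpf(1.0f, -112);  // src == 1, scale == 1.0
    uint32_t bits;
    memcpy(&bits, &value, sizeof(bits));
    assert(bits == 0x07800000u);
    assert((uint16_t)(bits >> 13) == 0x3C00);  // half-precision 1.0
  }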
+
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    float value = src[i] * scale;
+    dst[i] = value;
+  }
+}
+
+void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             int width,
+                             const uint8_t* luma,
+                             uint32_t lumacoeff) {
+  uint32_t bc = lumacoeff & 0xff;
+  uint32_t gc = (lumacoeff >> 8) & 0xff;
+  uint32_t rc = (lumacoeff >> 16) & 0xff;
+
+  int i;
   for (i = 0; i < width - 1; i += 2) {
     // Luminance in rows, color values in columns.
-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
-                           src_argb[2] * rc) & 0x7F00u) + luma;
-    const uint8* luma1;
+    const uint8_t* luma0 =
+        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+        luma;
+    const uint8_t* luma1;
     dst_argb[0] = luma0[src_argb[0]];
     dst_argb[1] = luma0[src_argb[1]];
     dst_argb[2] = luma0[src_argb[2]];
     dst_argb[3] = src_argb[3];
-    luma1 = ((src_argb[4] * bc + src_argb[5] * gc +
-              src_argb[6] * rc) & 0x7F00u) + luma;
+    luma1 =
+        ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) +
+        luma;
     dst_argb[4] = luma1[src_argb[4]];
     dst_argb[5] = luma1[src_argb[5]];
     dst_argb[6] = luma1[src_argb[6]];
@@ -2359,8 +2821,9 @@
   }
   if (width & 1) {
     // Luminance in rows, color values in columns.
-    const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc +
-                           src_argb[2] * rc) & 0x7F00u) + luma;
+    const uint8_t* luma0 =
+        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) +
+        luma;
     dst_argb[0] = luma0[src_argb[0]];
     dst_argb[1] = luma0[src_argb[1]];
     dst_argb[2] = luma0[src_argb[2]];
@@ -2368,7 +2831,7 @@
   }
 }
 
-void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) {
+void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
   int i;
   for (i = 0; i < width - 1; i += 2) {
     dst[3] = src[3];
@@ -2381,7 +2844,7 @@
   }
 }
 
-void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) {
+void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) {
   int i;
   for (i = 0; i < width - 1; i += 2) {
     dst_a[0] = src_argb[3];
@@ -2394,7 +2857,7 @@
   }
 }
 
-void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) {
+void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) {
   int i;
   for (i = 0; i < width - 1; i += 2) {
     dst[3] = src[0];
@@ -2413,13 +2876,13 @@
 #if !(defined(_MSC_VER) && defined(_M_IX86)) && \
     defined(HAS_I422TORGB565ROW_SSSE3)
 // row_win.cc has an asm version, but GCC uses a 2-step wrapper.
-void I422ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_u,
-                           const uint8* src_v,
-                           uint8* dst_rgb565,
+void I422ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_rgb565,
                            const struct YuvConstants* yuvconstants,
                            int width) {
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2434,14 +2897,14 @@
 #endif
 
 #if defined(HAS_I422TOARGB1555ROW_SSSE3)
-void I422ToARGB1555Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb1555,
+void I422ToARGB1555Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb1555,
                              const struct YuvConstants* yuvconstants,
                              int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2456,14 +2919,14 @@
 #endif
 
 #if defined(HAS_I422TOARGB4444ROW_SSSE3)
-void I422ToARGB4444Row_SSSE3(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             uint8* dst_argb4444,
+void I422ToARGB4444Row_SSSE3(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             uint8_t* dst_argb4444,
                              const struct YuvConstants* yuvconstants,
                              int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2478,13 +2941,13 @@
 #endif
 
 #if defined(HAS_NV12TORGB565ROW_SSSE3)
-void NV12ToRGB565Row_SSSE3(const uint8* src_y,
-                           const uint8* src_uv,
-                           uint8* dst_rgb565,
+void NV12ToRGB565Row_SSSE3(const uint8_t* src_y,
+                           const uint8_t* src_uv,
+                           uint8_t* dst_rgb565,
                            const struct YuvConstants* yuvconstants,
                            int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
@@ -2497,14 +2960,102 @@
 }
 #endif
 
+#if defined(HAS_NV12TORGB24ROW_SSSE3)
+void NV12ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+    src_y += twidth;
+    src_uv += twidth;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_NV21TORGB24ROW_SSSE3)
+void NV21ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_vu,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+    src_y += twidth;
+    src_vu += twidth;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_NV12TORGB24ROW_AVX2)
+void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
+    src_y += twidth;
+    src_uv += twidth;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
+#if defined(HAS_NV21TORGB24ROW_AVX2)
+void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
+    src_y += twidth;
+    src_vu += twidth;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
 #if defined(HAS_I422TORGB565ROW_AVX2)
-void I422ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
+void I422ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2523,14 +3074,14 @@
 #endif
 
 #if defined(HAS_I422TOARGB1555ROW_AVX2)
-void I422ToARGB1555Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
+void I422ToARGB1555Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2549,14 +3100,14 @@
 #endif
 
 #if defined(HAS_I422TOARGB4444ROW_AVX2)
-void I422ToARGB4444Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
+void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
@@ -2575,19 +3126,22 @@
 #endif
 
 #if defined(HAS_I422TORGB24ROW_AVX2)
-void I422ToRGB24Row_AVX2(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_rgb24,
-                            const struct YuvConstants* yuvconstants,
-                            int width) {
+void I422ToRGB24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
-    // TODO(fbarchard): ARGBToRGB24Row_AVX2
+#if defined(HAS_ARGBTORGB24ROW_AVX2)
+    ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
+#else
     ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+#endif
     src_y += twidth;
     src_u += twidth / 2;
     src_v += twidth / 2;
@@ -2598,13 +3152,13 @@
 #endif
 
 #if defined(HAS_NV12TORGB565ROW_AVX2)
-void NV12ToRGB565Row_AVX2(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
+void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
   // Row buffer for intermediate ARGB pixels.
-  SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]);
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
   while (width > 0) {
     int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
     NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
@@ -2620,6 +3174,62 @@
   }
 }
 #endif
+
+float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) {
+  float fsum = 0.f;
+  int i;
+#if defined(__clang__)
+#pragma clang loop vectorize_width(4)
+#endif
+  for (i = 0; i < width; ++i) {
+    float v = *src++;
+    fsum += v * v;
+    *dst++ = v * scale;
+  }
+  return fsum;
+}
+
+float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) {
+  float fmax = 0.f;
+  int i;
+  for (i = 0; i < width; ++i) {
+    float v = *src++;
+    float vs = v * scale;
+    fmax = (v > fmax) ? v : fmax;
+    *dst++ = vs;
+  }
+  return fmax;
+}
+
+void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ = *src++ * scale;
+  }
+}
+
+void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ =
+        (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
+    ++src;
+  }
+}
+
+// Filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_C(const uint16_t* src0,
+                const uint16_t* src1,
+                const uint16_t* src2,
+                const uint16_t* src3,
+                const uint16_t* src4,
+                uint32_t* dst,
+                int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++;
+  }
+}
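
The 1, 4, 6, 4, 1 weights are the binomial row of (1 + 1)^4, a cheap separable Gaussian: GaussCol_C applies them vertically into 32-bit sums, GaussRow_C applies them horizontally, so each output carries a combined weight of 16 * 16 = 256, which the rounded >> 8 in GaussRow_C removes. A quick check of that normalization:

  #include <assert.h>
  static void CheckGaussNormalization(void) {
    static const int k[5] = {1, 4, 6, 4, 1};
    int sum = 0, i;
    for (i = 0; i < 5; ++i) sum += k[i];
    assert(sum * sum == 256);  // matches the (x + 128) >> 8 in GaussRow_C
  }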
 
 #ifdef __cplusplus
 }  // extern "C"
--- a/third_party/libyuv/source/row_gcc.cc
+++ b/third_party/libyuv/source/row_gcc.cc
@@ -1,4 +1,3 @@
-// VERSION 2
 /*
  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
@@ -23,748 +22,1052 @@
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 // Constants for ARGB
-static vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
+static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+                              13, 65, 33, 0, 13, 65, 33, 0};
 
 // JPEG full range.
-static vec8 kARGBToYJ = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+                               15, 75, 38, 0, 15, 75, 38, 0};
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
 
-static vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+                              112, -74, -38, 0, 112, -74, -38, 0};
 
-static vec8 kARGBToUJ = {
-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+                               127, -84, -43, 0, 127, -84, -43, 0};
 
-static vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
-};
+static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0,
+                              -18, -94, 112, 0, -18, -94, 112, 0};
 
-static vec8 kARGBToVJ = {
-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+                               -20, -107, 127, 0, -20, -107, 127, 0};
 
 // Constants for BGRA
-static vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
+static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+                              0, 33, 65, 13, 0, 33, 65, 13};
 
-static vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+                              0, -38, -74, 112, 0, -38, -74, 112};
 
-static vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+                              0, 112, -94, -18, 0, 112, -94, -18};
 
 // Constants for ABGR
-static vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
+static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+                              33, 65, 13, 0, 33, 65, 13, 0};
 
-static vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+                              -38, -74, 112, 0, -38, -74, 112, 0};
 
-static vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+                              112, -94, -18, 0, 112, -94, -18, 0};
 
 // Constants for RGBA.
-static vec8 kRGBAToY = {
-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
+static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+                              0, 13, 65, 33, 0, 13, 65, 33};
 
-static vec8 kRGBAToU = {
-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+                              0, 112, -74, -38, 0, 112, -74, -38};
 
-static vec8 kRGBAToV = {
-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+                              0, -18, -94, 112, 0, -18, -94, 112};
 
-static uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
+static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
 
 // 7 bit fixed point 0.5.
-static vec16 kAddYJ64 = {
-  64, 64, 64, 64, 64, 64, 64, 64
-};
+static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
 
-static uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
-static uvec16 kAddUVJ128 = {
-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
+static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
 #endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
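
The tables above hold BT.601 coefficients in fixed point, in ARGB byte order (B, G, R, A). A scalar sketch of the per-pixel arithmetic the SSSE3 rows implement with pmaddubsw/phaddw and an arithmetic shift; helper names are illustrative, and the real U/V rows also average 2x2 pixel blocks first:

    #include <stdint.h>

    // Y: 7-bit coefficients plus the kAddY16 bias. YJ: full range, with
    // kAddYJ64 acting as a 0.5 rounding term. U/V: 8-bit coefficients
    // plus the kAddUV128 bias; negative intermediates shift arithmetically.
    static inline uint8_t ARGBToY_Scalar(uint8_t b, uint8_t g, uint8_t r) {
      return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
    }
    static inline uint8_t ARGBToYJ_Scalar(uint8_t b, uint8_t g, uint8_t r) {
      return (uint8_t)((15 * b + 75 * g + 38 * r + 64) >> 7);
    }
    static inline uint8_t ARGBToU_Scalar(uint8_t b, uint8_t g, uint8_t r) {
      return (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    }
    static inline uint8_t ARGBToV_Scalar(uint8_t b, uint8_t g, uint8_t r) {
      return (uint8_t)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
    }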
 
 #ifdef HAS_RGB24TOARGBROW_SSSE3
 
 // Shuffle table for converting RGB24 to ARGB.
-static uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
+static const uvec8 kShuffleMaskRGB24ToARGB = {
+    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
 
 // Shuffle table for converting RAW to ARGB.
-static uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
+                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
 
 // Shuffle table for converting RAW to RGB24.  First 8.
 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
-  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Middle 8.
 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
-  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Last 8.
 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
-  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RGB24.
-static uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kShuffleMaskARGBToRGB24 = {
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RAW.
-static uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kShuffleMaskARGBToRAW = {
+    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
-static uvec8 kShuffleMaskARGBToRGB24_0 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
+static const uvec8 kShuffleMaskARGBToRGB24_0 = {
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
 
 // YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-};
+static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
+                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
+                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};
 
 // YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
+static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
+                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
+                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};
 
 // UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
+static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
+                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
+                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};
 
 // UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
+static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
+                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
+                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
 
 // NV21 shuf 8 VU to 16 UV.
 static const lvec8 kShuffleNV21 = {
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
 };
 #endif  // HAS_RGB24TOARGBROW_SSSE3
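
Every entry in the shuffle tables above is a pshufb control byte: a source-lane index, where a set high bit (the 128u entries) forces the output byte to zero. A minimal scalar model of the instruction, for reading the tables only (hypothetical helper):

    #include <stdint.h>

    // out[i] takes src[ctl[i] & 15] unless bit 7 of ctl[i] is set, which
    // yields 0. The zero holes are later filled with 0xff alpha by
    // "por %%xmm5", or left as padding in the tail bytes.
    static void Pshufb_Scalar(const uint8_t src[16], const uint8_t ctl[16],
                              uint8_t out[16]) {
      for (int i = 0; i < 16; ++i) {
        out[i] = (uint8_t)((ctl[i] & 0x80) ? 0 : src[ctl[i] & 0x0f]);
      }
    }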
 
 #ifdef HAS_J400TOARGBROW_SSE2
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm1                   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y),     // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
+void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "pslld     $0x18,%%xmm5                    \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm0                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklwd %%xmm0,%%xmm0                   \n"
+      "punpckhwd %%xmm1,%%xmm1                   \n"
+      "por       %%xmm5,%%xmm0                   \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_J400TOARGBROW_SSE2
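
In scalar terms the routine replicates each luma byte into B, G and R and sets alpha to 255, the 0xff000000 mask built by pcmpeqb/pslld; the punpcklbw/punpcklwd pairs do this replication eight pixels at a time. An illustrative equivalent, not the C fallback itself:

    #include <stdint.h>

    static void J400ToARGB_Scalar(const uint8_t* src_y, uint8_t* dst_argb,
                                  int width) {
      for (int i = 0; i < width; ++i) {
        uint8_t y = src_y[i];
        dst_argb[4 * i + 0] = y;    // B
        dst_argb[4 * i + 1] = y;    // G
        dst_argb[4 * i + 2] = y;    // R
        dst_argb[4 * i + 3] = 255;  // A
      }
    }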
 
 #ifdef HAS_RGB24TOARGBROW_SSSE3
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
-    "pslld     $0x18,%%xmm5                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x30,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
-    "pshufb    %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
-    "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleMaskRGB24ToARGB)  // %3
-  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"  // 0xff000000
+      "pslld     $0x18,%%xmm5                    \n"
+      "movdqa    %3,%%xmm4                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm3                 \n"
+      "lea       0x30(%0),%0                     \n"
+      "movdqa    %%xmm3,%%xmm2                   \n"
+      "palignr   $0x8,%%xmm1,%%xmm2              \n"
+      "pshufb    %%xmm4,%%xmm2                   \n"
+      "por       %%xmm5,%%xmm2                   \n"
+      "palignr   $0xc,%%xmm0,%%xmm1              \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "movdqu    %%xmm2,0x20(%1)                 \n"
+      "por       %%xmm5,%%xmm0                   \n"
+      "pshufb    %%xmm4,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "palignr   $0x4,%%xmm3,%%xmm3              \n"
+      "pshufb    %%xmm4,%%xmm3                   \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "por       %%xmm5,%%xmm3                   \n"
+      "movdqu    %%xmm3,0x30(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_rgb24),              // %0
+        "+r"(dst_argb),               // %1
+        "+r"(width)                   // %2
+      : "m"(kShuffleMaskRGB24ToARGB)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
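
The palignr/pshufb sequence above reads 48 packed bytes and writes 64; per pixel it reduces to the sketch below. RAWToARGBRow that follows is the same loop with a mask that swaps R and B (indices 2,1,0 instead of 0,1,2):

    #include <stdint.h>

    static void RGB24ToARGB_Scalar(const uint8_t* src, uint8_t* dst,
                                   int width) {
      for (int i = 0; i < width; ++i) {
        dst[4 * i + 0] = src[3 * i + 0];  // B
        dst[4 * i + 1] = src[3 * i + 1];  // G
        dst[4 * i + 2] = src[3 * i + 2];  // R
        dst[4 * i + 3] = 255;             // A from the 0xff000000 mask
      }
    }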
 
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
-    "pslld     $0x18,%%xmm5                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x30,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "palignr   $0x8,%%xmm1,%%xmm2              \n"
-    "pshufb    %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "palignr   $0x4,%%xmm3,%%xmm3              \n"
-    "pshufb    %%xmm4,%%xmm3                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleMaskRAWToARGB)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"  // 0xff000000
+      "pslld     $0x18,%%xmm5                    \n"
+      "movdqa    %3,%%xmm4                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm3                 \n"
+      "lea       0x30(%0),%0                     \n"
+      "movdqa    %%xmm3,%%xmm2                   \n"
+      "palignr   $0x8,%%xmm1,%%xmm2              \n"
+      "pshufb    %%xmm4,%%xmm2                   \n"
+      "por       %%xmm5,%%xmm2                   \n"
+      "palignr   $0xc,%%xmm0,%%xmm1              \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "movdqu    %%xmm2,0x20(%1)                 \n"
+      "por       %%xmm5,%%xmm0                   \n"
+      "pshufb    %%xmm4,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "palignr   $0x4,%%xmm3,%%xmm3              \n"
+      "pshufb    %%xmm4,%%xmm3                   \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "por       %%xmm5,%%xmm3                   \n"
+      "movdqu    %%xmm3,0x30(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_raw),              // %0
+        "+r"(dst_argb),             // %1
+        "+r"(width)                 // %2
+      : "m"(kShuffleMaskRAWToARGB)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
-  asm volatile (
-   "movdqa     %3,%%xmm3                       \n"
-   "movdqa     %4,%%xmm4                       \n"
-   "movdqa     %5,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"
-    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
-    "lea       " MEMLEA(0x18,0) ",%0           \n"
-    "pshufb    %%xmm3,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
-    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_raw),    // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)       // %2
-  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
-    "m"(kShuffleMaskRAWToRGB24_1),  // %4
-    "m"(kShuffleMaskRAWToRGB24_2)   // %5
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+                         uint8_t* dst_rgb24,
+                         int width) {
+  asm volatile(
+      "movdqa     %3,%%xmm3                       \n"
+      "movdqa     %4,%%xmm4                       \n"
+      "movdqa     %5,%%xmm5                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x4(%0),%%xmm1                  \n"
+      "movdqu    0x8(%0),%%xmm2                  \n"
+      "lea       0x18(%0),%0                     \n"
+      "pshufb    %%xmm3,%%xmm0                   \n"
+      "pshufb    %%xmm4,%%xmm1                   \n"
+      "pshufb    %%xmm5,%%xmm2                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq      %%xmm1,0x8(%1)                  \n"
+      "movq      %%xmm2,0x10(%1)                 \n"
+      "lea       0x18(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_raw),                  // %0
+        "+r"(dst_rgb24),                // %1
+        "+r"(width)                     // %2
+      : "m"(kShuffleMaskRAWToRGB24_0),  // %3
+        "m"(kShuffleMaskRAWToRGB24_1),  // %4
+        "m"(kShuffleMaskRAWToRGB24_2)   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
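
Here the three overlapping loads (at +0, +4 and +8) and the three masks each produce 8 output bytes, which together amount to a byte reversal of every 3-byte pixel. Scalar sketch:

    #include <stdint.h>

    static void RAWToRGB24_Scalar(const uint8_t* src, uint8_t* dst,
                                  int width) {
      for (int i = 0; i < width; ++i) {
        dst[3 * i + 0] = src[3 * i + 2];  // B <- R slot
        dst[3 * i + 1] = src[3 * i + 1];  // G
        dst[3 * i + 2] = src[3 * i + 0];  // R <- B slot
      }
    }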
 
-void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "mov       $0x1080108,%%eax                \n"
-    "movd      %%eax,%%xmm5                    \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x20802080,%%eax               \n"
-    "movd      %%eax,%%xmm6                    \n"
-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psllw     $0xb,%%xmm3                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0xa,%%xmm4                     \n"
-    "psrlw     $0x5,%%xmm4                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psllw     $0x8,%%xmm7                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "psllw     $0xb,%%xmm2                     \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "psllw     $0x8,%%xmm1                     \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "pmulhuw   %%xmm6,%%xmm0                   \n"
-    "por       %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpckhbw %%xmm0,%%xmm2                   \n"
-    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "mov       $0x1080108,%%eax                \n"
+      "movd      %%eax,%%xmm5                    \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "mov       $0x20802080,%%eax               \n"
+      "movd      %%eax,%%xmm6                    \n"
+      "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+      "pcmpeqb   %%xmm3,%%xmm3                   \n"
+      "psllw     $0xb,%%xmm3                     \n"
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "psllw     $0xa,%%xmm4                     \n"
+      "psrlw     $0x5,%%xmm4                     \n"
+      "pcmpeqb   %%xmm7,%%xmm7                   \n"
+      "psllw     $0x8,%%xmm7                     \n"
+      "sub       %0,%1                           \n"
+      "sub       %0,%1                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "pand      %%xmm3,%%xmm1                   \n"
+      "psllw     $0xb,%%xmm2                     \n"
+      "pmulhuw   %%xmm5,%%xmm1                   \n"
+      "pmulhuw   %%xmm5,%%xmm2                   \n"
+      "psllw     $0x8,%%xmm1                     \n"
+      "por       %%xmm2,%%xmm1                   \n"
+      "pand      %%xmm4,%%xmm0                   \n"
+      "pmulhuw   %%xmm6,%%xmm0                   \n"
+      "por       %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "punpcklbw %%xmm0,%%xmm1                   \n"
+      "punpckhbw %%xmm0,%%xmm2                   \n"
+      "movdqu    %%xmm1,0x00(%1,%0,2)            \n"
+      "movdqu    %%xmm2,0x10(%1,%0,2)            \n"
+      "lea       0x10(%0),%0                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6", "xmm7");
 }
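
The magic multipliers implement bit replication through pmulhuw: multiplying a 5-bit field by 0x0108 equals (x << 3) | (x >> 2), and multiplying a 6-bit field by 0x2080 equals (x << 2) | (x >> 4). Scalar equivalent of one pixel, with an illustrative helper name:

    #include <stdint.h>

    static void RGB565ToARGB_Scalar(uint16_t p, uint8_t dst[4]) {
      uint8_t b = (uint8_t)(p & 0x1f);
      uint8_t g = (uint8_t)((p >> 5) & 0x3f);
      uint8_t r = (uint8_t)(p >> 11);
      dst[0] = (uint8_t)((b << 3) | (b >> 2));  // pmulhuw by 0x0108
      dst[1] = (uint8_t)((g << 2) | (g >> 4));  // pmulhuw by 0x2080
      dst[2] = (uint8_t)((r << 3) | (r >> 2));
      dst[3] = 255;                             // 0xff00 mask via psllw $0x8
    }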
 
-void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "mov       $0x1080108,%%eax                \n"
-    "movd      %%eax,%%xmm5                    \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x42004200,%%eax               \n"
-    "movd      %%eax,%%xmm6                    \n"
-    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psllw     $0xb,%%xmm3                     \n"
-    "movdqa    %%xmm3,%%xmm4                   \n"
-    "psrlw     $0x6,%%xmm4                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psllw     $0x8,%%xmm7                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "psllw     $0x1,%%xmm1                     \n"
-    "psllw     $0xb,%%xmm2                     \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "psllw     $0x8,%%xmm1                     \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "pmulhuw   %%xmm6,%%xmm0                   \n"
-    "pand      %%xmm7,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm1                   \n"
-    "punpckhbw %%xmm0,%%xmm2                   \n"
-    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "mov       $0x1080108,%%eax                \n"
+      "movd      %%eax,%%xmm5                    \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "mov       $0x42004200,%%eax               \n"
+      "movd      %%eax,%%xmm6                    \n"
+      "pshufd    $0x0,%%xmm6,%%xmm6              \n"
+      "pcmpeqb   %%xmm3,%%xmm3                   \n"
+      "psllw     $0xb,%%xmm3                     \n"
+      "movdqa    %%xmm3,%%xmm4                   \n"
+      "psrlw     $0x6,%%xmm4                     \n"
+      "pcmpeqb   %%xmm7,%%xmm7                   \n"
+      "psllw     $0x8,%%xmm7                     \n"
+      "sub       %0,%1                           \n"
+      "sub       %0,%1                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "psllw     $0x1,%%xmm1                     \n"
+      "psllw     $0xb,%%xmm2                     \n"
+      "pand      %%xmm3,%%xmm1                   \n"
+      "pmulhuw   %%xmm5,%%xmm2                   \n"
+      "pmulhuw   %%xmm5,%%xmm1                   \n"
+      "psllw     $0x8,%%xmm1                     \n"
+      "por       %%xmm2,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "pand      %%xmm4,%%xmm0                   \n"
+      "psraw     $0x8,%%xmm2                     \n"
+      "pmulhuw   %%xmm6,%%xmm0                   \n"
+      "pand      %%xmm7,%%xmm2                   \n"
+      "por       %%xmm2,%%xmm0                   \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "punpcklbw %%xmm0,%%xmm1                   \n"
+      "punpckhbw %%xmm0,%%xmm2                   \n"
+      "movdqu    %%xmm1,0x00(%1,%0,2)            \n"
+      "movdqu    %%xmm2,0x10(%1,%0,2)            \n"
+      "lea       0x10(%0),%0                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6", "xmm7");
 }
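
ARGB1555 expands the same way with three 5-bit channels; the psraw $0x8 plus 0xff00 mask smears bit 15, so the 1-bit alpha becomes 0x00 or 0xff:

    #include <stdint.h>

    static void ARGB1555ToARGB_Scalar(uint16_t p, uint8_t dst[4]) {
      uint8_t b = (uint8_t)(p & 0x1f);
      uint8_t g = (uint8_t)((p >> 5) & 0x1f);
      uint8_t r = (uint8_t)((p >> 10) & 0x1f);
      dst[0] = (uint8_t)((b << 3) | (b >> 2));
      dst[1] = (uint8_t)((g << 3) | (g >> 2));
      dst[2] = (uint8_t)((r << 3) | (r >> 2));
      dst[3] = (uint8_t)((p & 0x8000) ? 255 : 0);  // smeared alpha bit
    }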
 
-void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "mov       $0xf0f0f0f,%%eax                \n"
-    "movd      %%eax,%%xmm4                    \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "pslld     $0x4,%%xmm5                     \n"
-    "sub       %0,%1                           \n"
-    "sub       %0,%1                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "psllw     $0x4,%%xmm1                     \n"
-    "psrlw     $0x4,%%xmm3                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm2,%%xmm0                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
-    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "mov       $0xf0f0f0f,%%eax                \n"
+      "movd      %%eax,%%xmm4                    \n"
+      "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+      "movdqa    %%xmm4,%%xmm5                   \n"
+      "pslld     $0x4,%%xmm5                     \n"
+      "sub       %0,%1                           \n"
+      "sub       %0,%1                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "pand      %%xmm4,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm3                   \n"
+      "psllw     $0x4,%%xmm1                     \n"
+      "psrlw     $0x4,%%xmm3                     \n"
+      "por       %%xmm1,%%xmm0                   \n"
+      "por       %%xmm3,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklbw %%xmm2,%%xmm0                   \n"
+      "punpckhbw %%xmm2,%%xmm1                   \n"
+      "movdqu    %%xmm0,0x00(%1,%0,2)            \n"
+      "movdqu    %%xmm1,0x10(%1,%0,2)            \n"
+      "lea       0x10(%0),%0                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
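
A 4-bit field widens to 8 bits by duplicating the nibble, i.e. x * 0x11; the asm gets the same result with two masks plus psllw/psrlw instead of a multiply:

    #include <stdint.h>

    static void ARGB4444ToARGB_Scalar(uint16_t p, uint8_t dst[4]) {
      dst[0] = (uint8_t)((p & 0x0f) * 0x11);          // B: (x << 4) | x
      dst[1] = (uint8_t)(((p >> 4) & 0x0f) * 0x11);   // G
      dst[2] = (uint8_t)(((p >> 8) & 0x0f) * 0x11);   // R
      dst[3] = (uint8_t)(((p >> 12) & 0x0f) * 0x11);  // A
    }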
 
-void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "pshufb    %%xmm6,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm6,%%xmm2                   \n"
-    "pshufb    %%xmm6,%%xmm3                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "psrldq    $0x4,%%xmm1                     \n"
-    "pslldq    $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm2,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pslldq    $0x8,%%xmm5                     \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "psrldq    $0x8,%%xmm2                     \n"
-    "pslldq    $0x4,%%xmm3                     \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x30,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(kShuffleMaskARGBToRGB24)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+
+      "movdqa    %3,%%xmm6                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "pshufb    %%xmm6,%%xmm0                   \n"
+      "pshufb    %%xmm6,%%xmm1                   \n"
+      "pshufb    %%xmm6,%%xmm2                   \n"
+      "pshufb    %%xmm6,%%xmm3                   \n"
+      "movdqa    %%xmm1,%%xmm4                   \n"
+      "psrldq    $0x4,%%xmm1                     \n"
+      "pslldq    $0xc,%%xmm4                     \n"
+      "movdqa    %%xmm2,%%xmm5                   \n"
+      "por       %%xmm4,%%xmm0                   \n"
+      "pslldq    $0x8,%%xmm5                     \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "psrldq    $0x8,%%xmm2                     \n"
+      "pslldq    $0x4,%%xmm3                     \n"
+      "por       %%xmm3,%%xmm2                   \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "movdqu    %%xmm2,0x20(%1)                 \n"
+      "lea       0x30(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(width)                   // %2
+      : "m"(kShuffleMaskARGBToRGB24)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
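
Going the other way, pshufb compacts each 16-byte ARGB register to 12 payload bytes (alpha dropped) and the pslldq/psrldq/por pairs splice four such pieces into three full 16-byte stores. Per pixel this is simply the loop below; ARGBToRAWRow that follows is identical except kShuffleMaskARGBToRAW swaps R and B:

    #include <stdint.h>

    static void ARGBToRGB24_Scalar(const uint8_t* src, uint8_t* dst,
                                   int width) {
      for (int i = 0; i < width; ++i) {
        dst[3 * i + 0] = src[4 * i + 0];  // B
        dst[3 * i + 1] = src[4 * i + 1];  // G
        dst[3 * i + 2] = src[4 * i + 2];  // R; src[4 * i + 3] (A) dropped
      }
    }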
 
-void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm6                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "pshufb    %%xmm6,%%xmm0                   \n"
-    "pshufb    %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm6,%%xmm2                   \n"
-    "pshufb    %%xmm6,%%xmm3                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "psrldq    $0x4,%%xmm1                     \n"
-    "pslldq    $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm2,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pslldq    $0x8,%%xmm5                     \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "psrldq    $0x8,%%xmm2                     \n"
-    "pslldq    $0x4,%%xmm3                     \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x30,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(kShuffleMaskARGBToRAW)  // %3
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+
+      "movdqa    %3,%%xmm6                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "pshufb    %%xmm6,%%xmm0                   \n"
+      "pshufb    %%xmm6,%%xmm1                   \n"
+      "pshufb    %%xmm6,%%xmm2                   \n"
+      "pshufb    %%xmm6,%%xmm3                   \n"
+      "movdqa    %%xmm1,%%xmm4                   \n"
+      "psrldq    $0x4,%%xmm1                     \n"
+      "pslldq    $0xc,%%xmm4                     \n"
+      "movdqa    %%xmm2,%%xmm5                   \n"
+      "por       %%xmm4,%%xmm0                   \n"
+      "pslldq    $0x8,%%xmm5                     \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "psrldq    $0x8,%%xmm2                     \n"
+      "pslldq    $0x4,%%xmm3                     \n"
+      "por       %%xmm3,%%xmm2                   \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "movdqu    %%xmm2,0x20(%1)                 \n"
+      "lea       0x30(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src),                  // %0
+        "+r"(dst),                  // %1
+        "+r"(width)                 // %2
+      : "m"(kShuffleMaskARGBToRAW)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
-void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "psrld     $0x1b,%%xmm3                    \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psrld     $0x1a,%%xmm4                    \n"
-    "pslld     $0x5,%%xmm4                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0xb,%%xmm5                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "pslld     $0x8,%%xmm0                     \n"
-    "psrld     $0x3,%%xmm1                     \n"
-    "psrld     $0x5,%%xmm2                     \n"
-    "psrad     $0x10,%%xmm0                    \n"
-    "pand      %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm4,%%xmm2                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm1                   \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+#ifdef HAS_ARGBTORGB24ROW_AVX2
+// vpermd for 12+12 to 24
+static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
+
+void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm6                  \n"
+      "vmovdqa    %4,%%ymm7                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "lea        0x80(%0),%0                    \n"
+      "vpshufb    %%ymm6,%%ymm0,%%ymm0           \n"  // xxx0yyy0
+      "vpshufb    %%ymm6,%%ymm1,%%ymm1           \n"
+      "vpshufb    %%ymm6,%%ymm2,%%ymm2           \n"
+      "vpshufb    %%ymm6,%%ymm3,%%ymm3           \n"
+      "vpermd     %%ymm0,%%ymm7,%%ymm0           \n"  // pack to 24 bytes
+      "vpermd     %%ymm1,%%ymm7,%%ymm1           \n"
+      "vpermd     %%ymm2,%%ymm7,%%ymm2           \n"
+      "vpermd     %%ymm3,%%ymm7,%%ymm3           \n"
+      "vpermq     $0x3f,%%ymm1,%%ymm4            \n"  // combine 24 + 8
+      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vpermq     $0xf9,%%ymm1,%%ymm1            \n"  // combine 16 + 16
+      "vpermq     $0x4f,%%ymm2,%%ymm4            \n"
+      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "vpermq     $0xfe,%%ymm2,%%ymm2            \n"  // combine 8 + 24
+      "vpermq     $0x93,%%ymm3,%%ymm3            \n"
+      "vpor       %%ymm3,%%ymm2,%%ymm2           \n"
+      "vmovdqu    %%ymm2,0x40(%1)                \n"
+      "lea        0x60(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                     // %0
+        "+r"(dst),                     // %1
+        "+r"(width)                    // %2
+      : "m"(kShuffleMaskARGBToRGB24),  // %3
+        "m"(kPermdRGB24_AVX)           // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
+#endif
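
Reading the new AVX2 version: vpshufb compacts each 128-bit lane to 12 bytes plus 4 zeros, vpermd with kPermdRGB24_AVX = {0,1,2,4,5,6,3,7} gathers the six payload dwords to the front of the register, and the vpermq immediates rotate 24-byte results so the vpor/vmovdqu pairs emit three full 32-byte stores. Decoded under the usual 2-bits-per-destination-qword encoding (an interpretation, not part of the patch):

    // out[q] = src[(imm >> (2 * q)) & 3], low field first:
    //   0x3f -> {3,3,3,0}   0xf9 -> {1,2,3,3}   0x4f -> {3,3,0,1}
    //   0xfe -> {2,3,3,3}   0x93 -> {3,0,1,2}
    static inline int VpermqSource(unsigned imm, int out_qword) {
      return (int)((imm >> (2 * out_qword)) & 3);
    }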
 
-void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "movd       %3,%%xmm6                      \n"
-    "punpcklbw  %%xmm6,%%xmm6                  \n"
-    "movdqa     %%xmm6,%%xmm7                  \n"
-    "punpcklwd  %%xmm6,%%xmm6                  \n"
-    "punpckhwd  %%xmm7,%%xmm7                  \n"
-    "pcmpeqb    %%xmm3,%%xmm3                  \n"
-    "psrld      $0x1b,%%xmm3                   \n"
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrld      $0x1a,%%xmm4                   \n"
-    "pslld      $0x5,%%xmm4                    \n"
-    "pcmpeqb    %%xmm5,%%xmm5                  \n"
-    "pslld      $0xb,%%xmm5                    \n"
+#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI
+// Shuffle table for converting ARGBToRGB24
+static const ulvec8 kPermARGBToRGB24_0 = {
+    0u,  1u,  2u,  4u,  5u,  6u,  8u,  9u,  10u, 12u, 13u,
+    14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u,
+    29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u};
+static const ulvec8 kPermARGBToRGB24_1 = {
+    10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u,
+    25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u,
+    40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u};
+static const ulvec8 kPermARGBToRGB24_2 = {
+    21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u,
+    36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u,
+    50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u};
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu     (%0),%%xmm0                    \n"
-    "paddusb    %%xmm6,%%xmm0                  \n"
-    "movdqa     %%xmm0,%%xmm1                  \n"
-    "movdqa     %%xmm0,%%xmm2                  \n"
-    "pslld      $0x8,%%xmm0                    \n"
-    "psrld      $0x3,%%xmm1                    \n"
-    "psrld      $0x5,%%xmm2                    \n"
-    "psrad      $0x10,%%xmm0                   \n"
-    "pand       %%xmm3,%%xmm1                  \n"
-    "pand       %%xmm4,%%xmm2                  \n"
-    "pand       %%xmm5,%%xmm0                  \n"
-    "por        %%xmm2,%%xmm1                  \n"
-    "por        %%xmm1,%%xmm0                  \n"
-    "packssdw   %%xmm0,%%xmm0                  \n"
-    "lea        0x10(%0),%0                    \n"
-    "movq       %%xmm0,(%1)                    \n"
-    "lea        0x8(%1),%1                     \n"
-    "sub        $0x4,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(dither4) // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vmovdqa    %3,%%ymm5                      \n"
+      "vmovdqa    %4,%%ymm6                      \n"
+      "vmovdqa    %5,%%ymm7                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "lea        0x80(%0),%0                    \n"
+      "vpermt2b   %%ymm1,%%ymm5,%%ymm0           \n"
+      "vpermt2b   %%ymm2,%%ymm6,%%ymm1           \n"
+      "vpermt2b   %%ymm3,%%ymm7,%%ymm2           \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "vmovdqu    %%ymm2,0x40(%1)                \n"
+      "lea        0x60(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                // %0
+        "+r"(dst),                // %1
+        "+r"(width)               // %2
+      : "m"(kPermARGBToRGB24_0),  // %3
+        "m"(kPermARGBToRGB24_1),  // %4
+        "m"(kPermARGBToRGB24_2)   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7");
 }
+#endif
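
The AVX512VBMI path replaces that splicing with vpermt2b, which indexes across two registers at once: in the 256-bit form each control byte selects from the destination register for indices 0-31 and from the second source for 32-63, which is why the kPermARGBToRGB24_* tables run past 31. A scalar model of that semantics (illustrative, not an intrinsic):

    #include <stdint.h>

    static void Vpermt2b256_Scalar(const uint8_t a[32], const uint8_t b[32],
                                   const uint8_t idx[32], uint8_t out[32]) {
      for (int i = 0; i < 32; ++i) {
        out[i] = (idx[i] & 0x20) ? b[idx[i] & 0x1f] : a[idx[i] & 0x1f];
      }
    }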
 
+#ifdef HAS_ARGBTORAWROW_AVX2
+void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm6                  \n"
+      "vmovdqa    %4,%%ymm7                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "lea        0x80(%0),%0                    \n"
+      "vpshufb    %%ymm6,%%ymm0,%%ymm0           \n"  // xxx0yyy0
+      "vpshufb    %%ymm6,%%ymm1,%%ymm1           \n"
+      "vpshufb    %%ymm6,%%ymm2,%%ymm2           \n"
+      "vpshufb    %%ymm6,%%ymm3,%%ymm3           \n"
+      "vpermd     %%ymm0,%%ymm7,%%ymm0           \n"  // pack to 24 bytes
+      "vpermd     %%ymm1,%%ymm7,%%ymm1           \n"
+      "vpermd     %%ymm2,%%ymm7,%%ymm2           \n"
+      "vpermd     %%ymm3,%%ymm7,%%ymm3           \n"
+      "vpermq     $0x3f,%%ymm1,%%ymm4            \n"  // combine 24 + 8
+      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vpermq     $0xf9,%%ymm1,%%ymm1            \n"  // combine 16 + 16
+      "vpermq     $0x4f,%%ymm2,%%ymm4            \n"
+      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "vpermq     $0xfe,%%ymm2,%%ymm2            \n"  // combine 8 + 24
+      "vpermq     $0x93,%%ymm3,%%ymm3            \n"
+      "vpor       %%ymm3,%%ymm2,%%ymm2           \n"
+      "vmovdqu    %%ymm2,0x40(%1)                \n"
+      "lea        0x60(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                   // %0
+        "+r"(dst),                   // %1
+        "+r"(width)                  // %2
+      : "m"(kShuffleMaskARGBToRAW),  // %3
+        "m"(kPermdRGB24_AVX)         // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif
+
+void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm3,%%xmm3                   \n"
+      "psrld     $0x1b,%%xmm3                    \n"
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "psrld     $0x1a,%%xmm4                    \n"
+      "pslld     $0x5,%%xmm4                     \n"
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "pslld     $0xb,%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "pslld     $0x8,%%xmm0                     \n"
+      "psrld     $0x3,%%xmm1                     \n"
+      "psrld     $0x5,%%xmm2                     \n"
+      "psrad     $0x10,%%xmm0                    \n"
+      "pand      %%xmm3,%%xmm1                   \n"
+      "pand      %%xmm4,%%xmm2                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "por       %%xmm2,%%xmm1                   \n"
+      "por       %%xmm1,%%xmm0                   \n"
+      "packssdw  %%xmm0,%%xmm0                   \n"
+      "lea       0x10(%0),%0                     \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
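
The 565 pack truncates each channel to its field width and ORs the shifted fields; packssdw then narrows the per-pixel dwords to words. Scalar equivalent:

    #include <stdint.h>

    static uint16_t ARGBToRGB565_Scalar(const uint8_t argb[4]) {
      return (uint16_t)((argb[0] >> 3)             // B -> bits 0-4
                        | ((argb[1] >> 2) << 5)    // G -> bits 5-10
                        | ((argb[2] >> 3) << 11)); // R -> bits 11-15
    }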
+
+void ARGBToRGB565DitherRow_SSE2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "movd       %3,%%xmm6                      \n"
+      "punpcklbw  %%xmm6,%%xmm6                  \n"
+      "movdqa     %%xmm6,%%xmm7                  \n"
+      "punpcklwd  %%xmm6,%%xmm6                  \n"
+      "punpckhwd  %%xmm7,%%xmm7                  \n"
+      "pcmpeqb    %%xmm3,%%xmm3                  \n"
+      "psrld      $0x1b,%%xmm3                   \n"
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrld      $0x1a,%%xmm4                   \n"
+      "pslld      $0x5,%%xmm4                    \n"
+      "pcmpeqb    %%xmm5,%%xmm5                  \n"
+      "pslld      $0xb,%%xmm5                    \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu     (%0),%%xmm0                    \n"
+      "paddusb    %%xmm6,%%xmm0                  \n"
+      "movdqa     %%xmm0,%%xmm1                  \n"
+      "movdqa     %%xmm0,%%xmm2                  \n"
+      "pslld      $0x8,%%xmm0                    \n"
+      "psrld      $0x3,%%xmm1                    \n"
+      "psrld      $0x5,%%xmm2                    \n"
+      "psrad      $0x10,%%xmm0                   \n"
+      "pand       %%xmm3,%%xmm1                  \n"
+      "pand       %%xmm4,%%xmm2                  \n"
+      "pand       %%xmm5,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm1                  \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "packssdw   %%xmm0,%%xmm0                  \n"
+      "lea        0x10(%0),%0                    \n"
+      "movq       %%xmm0,(%1)                    \n"
+      "lea        0x8(%1),%1                     \n"
+      "sub        $0x4,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      : "m"(dither4)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
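
The dither variant is the same pack with one extra step: dither4 carries four per-column dither bytes, the punpck ladder replicates byte i across all four channels of pixel i, and paddusb adds it with unsigned saturation before the channels are truncated. One pixel, sketched:

    #include <stdint.h>

    static uint16_t ARGBToRGB565Dither_Scalar(const uint8_t argb[4],
                                              uint8_t d) {
      int b = argb[0] + d, g = argb[1] + d, r = argb[2] + d;  // paddusb
      if (b > 255) b = 255;
      if (g > 255) g = 255;
      if (r > 255) r = 255;
      return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    }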
+
 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "vbroadcastss %3,%%xmm6                    \n"
-    "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
-    "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
-    "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
-    "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
-    "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
-    "vpslld     $0x5,%%ymm4,%%ymm4             \n"
-    "vpslld     $0xb,%%ymm3,%%ymm5             \n"
+void ARGBToRGB565DitherRow_AVX2(const uint8_t* src,
+                                uint8_t* dst,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "vbroadcastss %3,%%xmm6                    \n"
+      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
+      "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
+      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
+      "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
+      "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
+      "vpslld     $0x5,%%ymm4,%%ymm4             \n"
+      "vpslld     $0xb,%%ymm3,%%ymm5             \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    (%0),%%ymm0                    \n"
-    "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
-    "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
-    "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
-    "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
-    "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
-    "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "lea        0x20(%0),%0                    \n"
-    "vmovdqu    %%xmm0,(%1)                    \n"
-    "lea        0x10(%1),%1                    \n"
-    "sub        $0x8,%2                        \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  : "m"(dither4) // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
+      "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
+      "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
+      "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
+      "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
+      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "lea        0x20(%0),%0                    \n"
+      "vmovdqu    %%xmm0,(%1)                    \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x8,%2                        \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      : "m"(dither4)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
 
+void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "psrld     $0x1b,%%xmm4                    \n"
+      "movdqa    %%xmm4,%%xmm5                   \n"
+      "pslld     $0x5,%%xmm5                     \n"
+      "movdqa    %%xmm4,%%xmm6                   \n"
+      "pslld     $0xa,%%xmm6                     \n"
+      "pcmpeqb   %%xmm7,%%xmm7                   \n"
+      "pslld     $0xf,%%xmm7                     \n"
 
-void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psrld     $0x1b,%%xmm4                    \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "pslld     $0x5,%%xmm5                     \n"
-    "movdqa    %%xmm4,%%xmm6                   \n"
-    "pslld     $0xa,%%xmm6                     \n"
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "pslld     $0xf,%%xmm7                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "psrad     $0x10,%%xmm0                    \n"
-    "psrld     $0x3,%%xmm1                     \n"
-    "psrld     $0x6,%%xmm2                     \n"
-    "psrld     $0x9,%%xmm3                     \n"
-    "pand      %%xmm7,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm2                   \n"
-    "pand      %%xmm6,%%xmm3                   \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "por       %%xmm3,%%xmm2                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :: "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm3                   \n"
+      "psrad     $0x10,%%xmm0                    \n"
+      "psrld     $0x3,%%xmm1                     \n"
+      "psrld     $0x6,%%xmm2                     \n"
+      "psrld     $0x9,%%xmm3                     \n"
+      "pand      %%xmm7,%%xmm0                   \n"
+      "pand      %%xmm4,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm2                   \n"
+      "pand      %%xmm6,%%xmm3                   \n"
+      "por       %%xmm1,%%xmm0                   \n"
+      "por       %%xmm3,%%xmm2                   \n"
+      "por       %%xmm2,%%xmm0                   \n"
+      "packssdw  %%xmm0,%%xmm0                   \n"
+      "lea       0x10(%0),%0                     \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
 }
 
-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0xc,%%xmm4                     \n"
-    "movdqa    %%xmm4,%%xmm3                   \n"
-    "psrlw     $0x8,%%xmm3                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm3,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm1                   \n"
-    "psrlq     $0x4,%%xmm0                     \n"
-    "psrlq     $0x8,%%xmm1                     \n"
-    "por       %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "psllw     $0xc,%%xmm4                     \n"
+      "movdqa    %%xmm4,%%xmm3                   \n"
+      "psrlw     $0x8,%%xmm3                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pand      %%xmm3,%%xmm0                   \n"
+      "pand      %%xmm4,%%xmm1                   \n"
+      "psrlq     $0x4,%%xmm0                     \n"
+      "psrlq     $0x8,%%xmm1                     \n"
+      "por       %%xmm1,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "lea       0x10(%0),%0                     \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 
+/*
+
+ARGBToAR30Row:
+
+Red Blue
+With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
+produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
+wanted for the blue channel. Red also needs to be shifted 4 bits left, so
+multiply by (1024+4)*16 for red.
+
+Alpha Green
+Alpha and Green are already in the high bits, so vpand can zero out the other
+bits, keeping just the 2 upper bits of alpha and the 8 bit green. The same
+multiplier could be used for green - (1024+4) - putting the 10 bit green in
+the lsb.  Alpha needs only a multiplier to shift it into position: it wants a
+gap of 10 bits above the green.  Green is 10 bits, so there are 6 bits in the
+low short.  4 more are needed, so a multiplier of 4 gets the 2 bits into the
+upper 16 bits, and then a shift of 4 is a multiply by 16, so (4*16) = 64.
+Then shift the result left 10 to position the A and G channels.
+*/
+
+// Shuffle tables to place the B and R channels in the high byte of each
+// 16 bit lane, ready for the vpmulhuw described above.
+static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u,  128u, 4u,  128u, 6u,
+                                   128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
+
+static const uvec8 kShuffleBR30 = {128u, 2u,  128u, 0u, 128u, 6u,  128u, 4u,
+                                   128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
+
+static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
+static const uint32_t kMaskRB10 = 0x3ff003ff;
+static const uint32_t kMaskAG10 = 0xc000ff00;
+static const uint32_t kMulAG10 = 64 * 65536 + 1028;
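+
+// Worked example: with v in the top byte of a 16 bit lane, pmulhuw by 1028
+// computes ((v << 8) * 1028) >> 16 = (v * 1028) >> 8 = (v << 2) | (v >> 6),
+// the usual 8-to-10 bit replication.  E.g. v = 0xff gives
+// (0xff00 * 1028) >> 16 = 0x3ff.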
+
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "movdqa     %3,%%xmm2                     \n"  // shuffler for RB
+      "movd       %4,%%xmm3                     \n"  // multipler for RB
+      "movd       %5,%%xmm4                     \n"  // mask for R10 B10
+      "movd       %6,%%xmm5                     \n"  // mask for AG
+      "movd       %7,%%xmm6                     \n"  // multipler for AG
+      "pshufd     $0x0,%%xmm3,%%xmm3            \n"
+      "pshufd     $0x0,%%xmm4,%%xmm4            \n"
+      "pshufd     $0x0,%%xmm5,%%xmm5            \n"
+      "pshufd     $0x0,%%xmm6,%%xmm6            \n"
+      "sub        %0,%1                         \n"
+
+      "1:                                       \n"
+      "movdqu     (%0),%%xmm0                   \n"  // fetch 4 ARGB pixels
+      "movdqa     %%xmm0,%%xmm1                 \n"
+      "pshufb     %%xmm2,%%xmm1                 \n"  // R0B0
+      "pand       %%xmm5,%%xmm0                 \n"  // A0G0
+      "pmulhuw    %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
+      "pmulhuw    %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
+      "pand       %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
+      "pslld      $10,%%xmm0                    \n"  // A2 x10 G10 x10
+      "por        %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
+      "movdqu     %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
+      "add        $0x10,%0                      \n"
+      "sub        $0x4,%2                       \n"
+      "jg         1b                            \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleRB30),  // %3
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "movdqa     %3,%%xmm2                     \n"  // shuffler for RB
+      "movd       %4,%%xmm3                     \n"  // multipler for RB
+      "movd       %5,%%xmm4                     \n"  // mask for R10 B10
+      "movd       %6,%%xmm5                     \n"  // mask for AG
+      "movd       %7,%%xmm6                     \n"  // multipler for AG
+      "pshufd     $0x0,%%xmm3,%%xmm3            \n"
+      "pshufd     $0x0,%%xmm4,%%xmm4            \n"
+      "pshufd     $0x0,%%xmm5,%%xmm5            \n"
+      "pshufd     $0x0,%%xmm6,%%xmm6            \n"
+      "sub        %0,%1                         \n"
+
+      "1:                                       \n"
+      "movdqu     (%0),%%xmm0                   \n"  // fetch 4 ABGR pixels
+      "movdqa     %%xmm0,%%xmm1                 \n"
+      "pshufb     %%xmm2,%%xmm1                 \n"  // R0B0
+      "pand       %%xmm5,%%xmm0                 \n"  // A0G0
+      "pmulhuw    %%xmm3,%%xmm1                 \n"  // X2 R16 X4  B10
+      "pmulhuw    %%xmm6,%%xmm0                 \n"  // X10 A2 X10 G10
+      "pand       %%xmm4,%%xmm1                 \n"  // X2 R10 X10 B10
+      "pslld      $10,%%xmm0                    \n"  // A2 x10 G10 x10
+      "por        %%xmm1,%%xmm0                 \n"  // A2 R10 G10 B10
+      "movdqu     %%xmm0,(%1,%0)                \n"  // store 4 AR30 pixels
+      "add        $0x10,%0                      \n"
+      "sub        $0x4,%2                       \n"
+      "jg         1b                            \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleBR30),  // %3  reversed shuffler
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
+      "vbroadcastss  %4,%%ymm3                   \n"  // multipler for RB
+      "vbroadcastss  %5,%%ymm4                   \n"  // mask for R10 B10
+      "vbroadcastss  %6,%%ymm5                   \n"  // mask for AG
+      "vbroadcastss  %7,%%ymm6                   \n"  // multipler for AG
+      "sub        %0,%1                          \n"
+
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"  // fetch 8 ARGB pixels
+      "vpshufb    %%ymm2,%%ymm0,%%ymm1           \n"  // R0B0
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"  // A0G0
+      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"  // X2 R16 X4  B10
+      "vpmulhuw   %%ymm6,%%ymm0,%%ymm0           \n"  // X10 A2 X10 G10
+      "vpand      %%ymm4,%%ymm1,%%ymm1           \n"  // X2 R10 X10 B10
+      "vpslld     $10,%%ymm0,%%ymm0              \n"  // A2 x10 G10 x10
+      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"  // A2 R10 G10 B10
+      "vmovdqu    %%ymm0,(%1,%0)                 \n"  // store 8 AR30 pixels
+      "add        $0x20,%0                       \n"
+      "sub        $0x8,%2                        \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleRB30),  // %3
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm2                  \n"  // shuffler for RB
+      "vbroadcastss  %4,%%ymm3                   \n"  // multipler for RB
+      "vbroadcastss  %5,%%ymm4                   \n"  // mask for R10 B10
+      "vbroadcastss  %6,%%ymm5                   \n"  // mask for AG
+      "vbroadcastss  %7,%%ymm6                   \n"  // multipler for AG
+      "sub        %0,%1                          \n"
+
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"  // fetch 8 ABGR pixels
+      "vpshufb    %%ymm2,%%ymm0,%%ymm1           \n"  // R0B0
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"  // A0G0
+      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"  // X2 R16 X4  B10
+      "vpmulhuw   %%ymm6,%%ymm0,%%ymm0           \n"  // X10 A2 X10 G10
+      "vpand      %%ymm4,%%ymm1,%%ymm1           \n"  // X2 R10 X10 B10
+      "vpslld     $10,%%ymm0,%%ymm0              \n"  // A2 x10 G10 x10
+      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"  // A2 R10 G10 B10
+      "vmovdqu    %%ymm0,(%1,%0)                 \n"  // store 8 AR30 pixels
+      "add        $0x20,%0                       \n"
+      "sub        $0x8,%2                        \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleBR30),  // %3  reversed shuffler
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa    %3,%%xmm4                       \n"
+      "movdqa    %4,%%xmm5                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm3,%%xmm2                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kARGBToY),   // %3
+        "m"(kAddY16)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBTOYROW_SSSE3
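+
+// Scalar sketch of the row above, assuming libyuv's BT.601 constants
+// (kARGBToY = {13, 65, 33, 0} per pixel, kAddY16 = 16 per byte):
+//   y = ((13 * b + 65 * g + 33 * r) >> 7) + 16;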
 
@@ -771,190 +1074,190 @@
 #ifdef HAS_ARGBTOYJROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
 // Same as ARGBToYRow, but with different coefficients, no +16 bias, and rounding.
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToYJ),  // %3
-    "m"(kAddYJ64)    // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa    %3,%%xmm4                       \n"
+      "movdqa    %4,%%xmm5                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm3,%%xmm2                   \n"
+      "paddw     %%xmm5,%%xmm0                   \n"
+      "paddw     %%xmm5,%%xmm2                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kARGBToYJ),  // %3
+        "m"(kAddYJ64)    // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBTOYJROW_SSSE3
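+
+// Scalar sketch of the row above.  The YJ variant is full range: assuming
+// kARGBToYJ = {15, 75, 38, 0} and kAddYJ64 = 64 per lane (as in upstream
+// libyuv), it computes
+//   y = (15 * b + 75 * g + 38 * r + 64) >> 7;  // rounded, no +16 offset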
 
 #ifdef HAS_ARGBTOYROW_AVX2
 // vpermd to restore dword order after the lane-wise vphaddw + vpackuswb.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
 
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"
-    "vbroadcastf128 %4,%%ymm5                  \n"
-    "vmovdqu    %5,%%ymm6                      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
-    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
-    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
-    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
-    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
-    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToY),   // %3
-    "m"(kAddY16),    // %4
-    "m"(kPermdARGBToY_AVX)  // %5
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vbroadcastf128 %4,%%ymm5                  \n"
+      "vmovdqu    %5,%%ymm6                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "lea       0x80(%0),%0                     \n"
+      "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
+      "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+      "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+      "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+      "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
+      "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),         // %0
+        "+r"(dst_y),            // %1
+        "+r"(width)             // %2
+      : "m"(kARGBToY),          // %3
+        "m"(kAddY16),           // %4
+        "m"(kPermdARGBToY_AVX)  // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBTOYROW_AVX2
 
 #ifdef HAS_ARGBTOYJROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"
-    "vbroadcastf128 %4,%%ymm5                  \n"
-    "vmovdqu    %5,%%ymm6                      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
-    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
-    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
-    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
-    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
-    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
-    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kARGBToYJ),   // %3
-    "m"(kAddYJ64),    // %4
-    "m"(kPermdARGBToY_AVX)  // %5
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vbroadcastf128 %4,%%ymm5                  \n"
+      "vmovdqu    %5,%%ymm6                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "lea       0x80(%0),%0                     \n"
+      "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
+      "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
+      "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
+      "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
+      "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
+      "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+      "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),         // %0
+        "+r"(dst_y),            // %1
+        "+r"(width)             // %2
+      : "m"(kARGBToYJ),         // %3
+        "m"(kAddYJ64),          // %4
+        "m"(kPermdARGBToY_AVX)  // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBTOYJROW_AVX2
 
 #ifdef HAS_ARGBTOUVROW_SSSE3
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
+void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa    %5,%%xmm3                       \n"
+      "movdqa    %6,%%xmm4                       \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
 
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kARGBToV),  // %5
-    "m"(kARGBToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x10(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm1                   \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x20(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "movdqu    0x30(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+
+      "lea       0x40(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm6,%%xmm2             \n"
+      "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm2,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm1                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm1                     \n"
+      "packsswb  %%xmm1,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movlps    %%xmm0,(%1)                     \n"
+      "movhps    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kARGBToV),                     // %5
+        "m"(kARGBToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 #endif  // HAS_ARGBTOUVROW_SSSE3
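+
+// Scalar sketch of the row above: each 2x2 block is averaged (pavgb between
+// rows, then shufps + pavgb across columns), and, assuming libyuv's BT.601
+// constants kARGBToU = {112, -74, -38, 0} and kARGBToV = {-18, -94, 112, 0},
+//   u = ((112 * b - 74 * g - 38 * r) >> 8) + 128;
+//   v = ((112 * r - 94 * g - 18 * b) >> 8) + 128;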
 
@@ -961,717 +1264,750 @@
 #ifdef HAS_ARGBTOUVROW_AVX2
 // vpshufb table to restore the order of shorts after the lane-wise
 // vphaddw + vpacksswb.
 static const lvec8 kShufARGBToUV_AVX = {
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vbroadcastf128 %5,%%ymm5                  \n"
-    "vbroadcastf128 %6,%%ymm6                  \n"
-    "vbroadcastf128 %7,%%ymm7                  \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
-    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
-    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
-    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
+void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "vbroadcastf128 %5,%%ymm5                  \n"
+      "vbroadcastf128 %6,%%ymm6                  \n"
+      "vbroadcastf128 %7,%%ymm7                  \n"
+      "sub        %1,%2                          \n"
 
-    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
-    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
-    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
-    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
-    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpshufb    %8,%%ymm0,%%ymm0               \n"
-    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
+      "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
+      "vpavgb    0x40(%0,%4,1),%%ymm2,%%ymm2     \n"
+      "vpavgb    0x60(%0,%4,1),%%ymm3,%%ymm3     \n"
+      "lea        0x80(%0),%0                    \n"
+      "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
+      "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
+      "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
+      "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
+      "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
+      "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
 
-    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kAddUV128),  // %5
-    "m"(kARGBToV),   // %6
-    "m"(kARGBToU),   // %7
-    "m"(kShufARGBToUV_AVX)  // %8
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
+      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
+      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
+      "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
+      "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
+      "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpshufb    %8,%%ymm0,%%ymm0               \n"
+      "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
+
+      "vextractf128 $0x0,%%ymm0,(%1)             \n"
+      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x20,%3                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kAddUV128),                    // %5
+        "m"(kARGBToV),                     // %6
+        "m"(kARGBToU),                     // %7
+        "m"(kShufARGBToUV_AVX)             // %8
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBTOUVROW_AVX2
 
 #ifdef HAS_ARGBTOUVJROW_AVX2
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vbroadcastf128 %5,%%ymm5                  \n"
-    "vbroadcastf128 %6,%%ymm6                  \n"
-    "vbroadcastf128 %7,%%ymm7                  \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
-    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
-    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
-    "lea       " MEMLEA(0x80,0) ",%0           \n"
-    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
-    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
-    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
-    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "vbroadcastf128 %5,%%ymm5                  \n"
+      "vbroadcastf128 %6,%%ymm6                  \n"
+      "vbroadcastf128 %7,%%ymm7                  \n"
+      "sub        %1,%2                          \n"
 
-    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
-    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
-    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
-    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
-    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpshufb    %8,%%ymm0,%%ymm0               \n"
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x40(%0),%%ymm2                \n"
+      "vmovdqu    0x60(%0),%%ymm3                \n"
+      "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
+      "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
+      "vpavgb    0x40(%0,%4,1),%%ymm2,%%ymm2     \n"
+      "vpavgb    0x60(%0,%4,1),%%ymm3,%%ymm3     \n"
+      "lea       0x80(%0),%0                     \n"
+      "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
+      "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
+      "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
+      "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
+      "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
+      "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
 
-    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kAddUVJ128),  // %5
-    "m"(kARGBToVJ),  // %6
-    "m"(kARGBToUJ),  // %7
-    "m"(kShufARGBToUV_AVX)  // %8
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
+      "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
+      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
+      "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
+      "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
+      "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpshufb    %8,%%ymm0,%%ymm0               \n"
+
+      "vextractf128 $0x0,%%ymm0,(%1)             \n"
+      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1)     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kAddUVJ128),                   // %5
+        "m"(kARGBToVJ),                    // %6
+        "m"(kARGBToUJ),                    // %7
+        "m"(kShufARGBToUV_AVX)             // %8
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBTOUVJROW_AVX2
 
 #ifdef HAS_ARGBTOUVJROW_SSSE3
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
+void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+                        int src_stride_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  asm volatile(
+      "movdqa    %5,%%xmm3                       \n"
+      "movdqa    %6,%%xmm4                       \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
 
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_argb)), // %4
-    "m"(kARGBToVJ),  // %5
-    "m"(kARGBToUJ),  // %6
-    "m"(kAddUVJ128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x10(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm1                   \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x20(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "movdqu    0x30(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+
+      "lea       0x40(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm6,%%xmm2             \n"
+      "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm2,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm1                   \n"
+      "paddw     %%xmm5,%%xmm0                   \n"
+      "paddw     %%xmm5,%%xmm1                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm1                     \n"
+      "packsswb  %%xmm1,%%xmm0                   \n"
+      "movlps    %%xmm0,(%1)                     \n"
+      "movhps    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_argb)),  // %4
+        "m"(kARGBToVJ),                    // %5
+        "m"(kARGBToUJ),                    // %6
+        "m"(kAddUVJ128)                    // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 #endif  // HAS_ARGBTOUVJROW_SSSE3
 
 #ifdef HAS_ARGBTOUV444ROW_SSSE3
-void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
                           int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm3                       \n"
-    "movdqa    %5,%%xmm4                       \n"
-    "movdqa    %6,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm6                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm2                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "packsswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm3,%%xmm0                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm2                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm2                     \n"
-    "packsswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),        // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "m"(kARGBToV),  // %4
-    "m"(kARGBToU),  // %5
-    "m"(kAddUV128)  // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6"
-  );
+  asm volatile(
+      "movdqa    %4,%%xmm3                       \n"
+      "movdqa    %5,%%xmm4                       \n"
+      "movdqa    %6,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm6                   \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm2                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm2                     \n"
+      "packsswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "pmaddubsw %%xmm3,%%xmm0                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm2                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm2                     \n"
+      "packsswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "movdqu    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+rm"(width)     // %3
+      : "m"(kARGBToV),   // %4
+        "m"(kARGBToU),   // %5
+        "m"(kAddUV128)   // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6");
 }
 #endif  // HAS_ARGBTOUV444ROW_SSSE3
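
A minimal scalar sketch of the per-pixel math the ARGBToUV444 row above
vectorizes. The 112/-74/-38 and -18/-94/112 weights are assumed BT.601
values; the asm uses whatever kARGBToU/kARGBToV actually hold.

    #include <stdint.h>

    // pmaddubsw + phaddw form the weighted sum, psraw $0x8 divides by
    // 256 (>> on a negative int is assumed arithmetic here), and paddb
    // adds the 128 bias from kAddUV128.
    static uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    }
    static uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
    }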
 
-void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kBGRAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa    %4,%%xmm5                       \n"
+      "movdqa    %3,%%xmm4                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm3,%%xmm2                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_bgra),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kBGRAToY),   // %3
+        "m"(kAddY16)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
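
BGRAToYRow above and the ABGR/RGBA variants below share one per-pixel
formula; only the byte order baked into each k*ToY constant changes. A
hedged scalar equivalent (the 13/65/33 weights are an assumption):

    #include <stdint.h>

    // pmaddubsw + phaddw form the weighted sum, psrlw $0x7 scales it,
    // and paddb adds the +16 bias loaded from kAddY16.
    static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
    }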
 
-void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
+void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
+                       int src_stride_bgra,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa    %5,%%xmm3                       \n"
+      "movdqa    %6,%%xmm4                       \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
 
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_bgra0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_bgra)), // %4
-    "m"(kBGRAToV),  // %5
-    "m"(kBGRAToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x10(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm1                   \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x20(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "movdqu    0x30(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+
+      "lea       0x40(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm6,%%xmm2             \n"
+      "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm2,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm1                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm1                     \n"
+      "packsswb  %%xmm1,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movlps    %%xmm0,(%1)                     \n"
+      "movhps    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_bgra0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_bgra)),  // %4
+        "m"(kBGRAToV),                     // %5
+        "m"(kBGRAToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
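
The strided UV rows (BGRAToUVRow above and the ABGR/RGBA variants
below) subsample chroma over a 2x2 block before applying the same
matrix: pavgb against the next row averages vertically, then the
shufps $0x88/$0xdd + pavgb sequence averages horizontally. A sketch of
that averaging for one channel, assuming 4-byte pixels:

    #include <stdint.h>

    // pavgb rounds up: (a + b + 1) >> 1, applied once per direction.
    static uint8_t Subsample2x2(const uint8_t* row0, const uint8_t* row1) {
      uint8_t left  = (uint8_t)((row0[0] + row1[0] + 1) >> 1);
      uint8_t right = (uint8_t)((row0[4] + row1[4] + 1) >> 1);  // next pixel
      return (uint8_t)((left + right + 1) >> 1);
    }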
 
-void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kABGRToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa    %4,%%xmm5                       \n"
+      "movdqa    %3,%%xmm4                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm3,%%xmm2                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_abgr),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kABGRToY),   // %3
+        "m"(kAddY16)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
-  asm volatile (
-    "movdqa    %4,%%xmm5                       \n"
-    "movdqa    %3,%%xmm4                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm4,%%xmm3                   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "phaddw    %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  : "m"(kRGBAToY),   // %3
-    "m"(kAddY16)     // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movdqa    %4,%%xmm5                       \n"
+      "movdqa    %3,%%xmm4                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x40(%0),%0                     \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "phaddw    %%xmm3,%%xmm2                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_rgba),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      : "m"(kRGBAToY),   // %3
+        "m"(kAddY16)     // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
+void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,
+                       int src_stride_abgr,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa    %5,%%xmm3                       \n"
+      "movdqa    %6,%%xmm4                       \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
 
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_abgr0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_abgr)), // %4
-    "m"(kABGRToV),  // %5
-    "m"(kABGRToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x10(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm1                   \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x20(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "movdqu    0x30(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+
+      "lea       0x40(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm6,%%xmm2             \n"
+      "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm2,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm1                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm1                     \n"
+      "packsswb  %%xmm1,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movlps    %%xmm0,(%1)                     \n"
+      "movhps    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_abgr0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_abgr)),  // %4
+        "m"(kABGRToV),                     // %5
+        "m"(kABGRToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 
-void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "movdqa    %5,%%xmm3                       \n"
-    "movdqa    %6,%%xmm4                       \n"
-    "movdqa    %7,%%xmm5                       \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
+void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0,
+                       int src_stride_rgba,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  asm volatile(
+      "movdqa    %5,%%xmm3                       \n"
+      "movdqa    %6,%%xmm4                       \n"
+      "movdqa    %7,%%xmm5                       \n"
+      "sub       %1,%2                           \n"
 
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm7                   \n"
-    "shufps    $0x88,%%xmm6,%%xmm2             \n"
-    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
-    "pavgb     %%xmm7,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm2                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "phaddw    %%xmm2,%%xmm0                   \n"
-    "phaddw    %%xmm6,%%xmm1                   \n"
-    "psraw     $0x8,%%xmm0                     \n"
-    "psraw     $0x8,%%xmm1                     \n"
-    "packsswb  %%xmm1,%%xmm0                   \n"
-    "paddb     %%xmm5,%%xmm0                   \n"
-    "movlps    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_rgba0),       // %0
-    "+r"(dst_u),           // %1
-    "+r"(dst_v),           // %2
-    "+rm"(width)           // %3
-  : "r"((intptr_t)(src_stride_rgba)), // %4
-    "m"(kRGBAToV),  // %5
-    "m"(kRGBAToU),  // %6
-    "m"(kAddUV128)  // %7
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x10(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm1                   \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x20(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqu    0x30(%0),%%xmm6                 \n"
+      "movdqu    0x30(%0,%4,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+
+      "lea       0x40(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm7                   \n"
+      "shufps    $0x88,%%xmm6,%%xmm2             \n"
+      "shufps    $0xdd,%%xmm6,%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "phaddw    %%xmm2,%%xmm0                   \n"
+      "phaddw    %%xmm6,%%xmm1                   \n"
+      "psraw     $0x8,%%xmm0                     \n"
+      "psraw     $0x8,%%xmm1                     \n"
+      "packsswb  %%xmm1,%%xmm0                   \n"
+      "paddb     %%xmm5,%%xmm0                   \n"
+      "movlps    %%xmm0,(%1)                     \n"
+      "movhps    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_rgba0),                   // %0
+        "+r"(dst_u),                       // %1
+        "+r"(dst_v),                       // %2
+        "+rm"(width)                       // %3
+      : "r"((intptr_t)(src_stride_rgba)),  // %4
+        "m"(kRGBAToV),                     // %5
+        "m"(kRGBAToU),                     // %6
+        "m"(kAddUV128)                     // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
 }
 
 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
 
 // Read 8 UV from 444
-#define READYUV444                                                             \
-    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+#define READYUV444                                                \
+  "movq       (%[u_buf]),%%xmm0                               \n" \
+  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
 
 // Read 4 UV from 422, upsample to 8 UV
-#define READYUV422                                                             \
-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+#define READYUV422                                                \
+  "movd       (%[u_buf]),%%xmm0                               \n" \
+  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
 
+// Read 4 UV from 10-bit 422, upsample to 8 UV
+// TODO(fbarchard): Consider shufb to replace pack/unpack
+// TODO(fbarchard): Consider pmulhuw to replace psraw
+// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits.
+#define READYUV210                                                \
+  "movq       (%[u_buf]),%%xmm0                               \n" \
+  "movq       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklwd  %%xmm1,%%xmm0                                   \n" \
+  "psraw      $0x2,%%xmm0                                     \n" \
+  "packuswb   %%xmm0,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movdqu     (%[y_buf]),%%xmm4                               \n" \
+  "psllw      $0x6,%%xmm4                                     \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                         \n"
+
 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422                                                            \
-    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
-    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
-    "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \
-    "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"
+#define READYUVA422                                               \
+  "movd       (%[u_buf]),%%xmm0                               \n" \
+  "movd       0x00(%[u_buf],%[v_buf],1),%%xmm1                \n" \
+  "lea        0x4(%[u_buf]),%[u_buf]                          \n" \
+  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n" \
+  "movq       (%[a_buf]),%%xmm5                               \n" \
+  "lea        0x8(%[a_buf]),%[a_buf]                          \n"
 
-// Read 2 UV from 411, upsample to 8 UV.
-// reading 4 bytes is an msan violation.
-//    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"
-//    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)
-// pinsrw fails with drmemory
-//  __asm pinsrw     xmm0, [esi], 0        /* U */
-//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
-#define READYUV411_TEMP                                                        \
-    "movzwl     " MEMACCESS([u_buf]) ",%[temp]                  \n"            \
-    "movd       %[temp],%%xmm0                                  \n"            \
-    MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) "       \n"            \
-    "movd       %[temp],%%xmm1                                  \n"            \
-    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
-    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "punpckldq  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
-
 // Read 4 UV from NV12, upsample to 8 UV
-#define READNV12                                                               \
-    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
-    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
-    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+#define READNV12                                                  \
+  "movq       (%[uv_buf]),%%xmm0                              \n" \
+  "lea        0x8(%[uv_buf]),%[uv_buf]                        \n" \
+  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
 
 // Read 4 VU from NV21, upsample to 8 UV
-#define READNV21                                                               \
-    "movq       " MEMACCESS([vu_buf]) ",%%xmm0                  \n"            \
-    "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
-    "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
-    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
-    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
-    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
+#define READNV21                                                  \
+  "movq       (%[vu_buf]),%%xmm0                              \n" \
+  "lea        0x8(%[vu_buf]),%[vu_buf]                        \n" \
+  "pshufb     %[kShuffleNV21], %%xmm0                         \n" \
+  "movq       (%[y_buf]),%%xmm4                               \n" \
+  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
+  "lea        0x8(%[y_buf]),%[y_buf]                          \n"
 
 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
-#define READYUY2                                                               \
-    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                \n"            \
-    "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \
-    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \
-    "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \
-    "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"
+#define READYUY2                                                  \
+  "movdqu     (%[yuy2_buf]),%%xmm4                            \n" \
+  "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n" \
+  "movdqu     (%[yuy2_buf]),%%xmm0                            \n" \
+  "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n" \
+  "lea        0x10(%[yuy2_buf]),%[yuy2_buf]                   \n"
 
 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
-#define READUYVY                                                               \
-    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                \n"            \
-    "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \
-    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \
-    "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \
-    "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
+#define READUYVY                                                  \
+  "movdqu     (%[uyvy_buf]),%%xmm4                            \n" \
+  "pshufb     %[kShuffleUYVYY], %%xmm4                        \n" \
+  "movdqu     (%[uyvy_buf]),%%xmm0                            \n" \
+  "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n" \
+  "lea        0x10(%[uyvy_buf]),%[uyvy_buf]                   \n"
 
 #if defined(__x86_64__)
-#define YUVTORGB_SETUP(yuvconstants)                                           \
-    "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8            \n"            \
-    "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \
-    "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \
-    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \
-    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"            \
-    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \
-    "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
+#define YUVTORGB_SETUP(yuvconstants)                              \
+  "movdqa     (%[yuvconstants]),%%xmm8                        \n" \
+  "movdqa     32(%[yuvconstants]),%%xmm9                      \n" \
+  "movdqa     64(%[yuvconstants]),%%xmm10                     \n" \
+  "movdqa     96(%[yuvconstants]),%%xmm11                     \n" \
+  "movdqa     128(%[yuvconstants]),%%xmm12                    \n" \
+  "movdqa     160(%[yuvconstants]),%%xmm13                    \n" \
+  "movdqa     192(%[yuvconstants]),%%xmm14                    \n"
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants)                                                 \
-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "movdqa     %%xmm0,%%xmm3                                   \n"            \
-    "movdqa     %%xmm11,%%xmm0                                  \n"            \
-    "pmaddubsw  %%xmm8,%%xmm1                                   \n"            \
-    "psubw      %%xmm1,%%xmm0                                   \n"            \
-    "movdqa     %%xmm12,%%xmm1                                  \n"            \
-    "pmaddubsw  %%xmm9,%%xmm2                                   \n"            \
-    "psubw      %%xmm2,%%xmm1                                   \n"            \
-    "movdqa     %%xmm13,%%xmm2                                  \n"            \
-    "pmaddubsw  %%xmm10,%%xmm3                                  \n"            \
-    "psubw      %%xmm3,%%xmm2                                   \n"            \
-    "pmulhuw    %%xmm14,%%xmm4                                  \n"            \
-    "paddsw     %%xmm4,%%xmm0                                   \n"            \
-    "paddsw     %%xmm4,%%xmm1                                   \n"            \
-    "paddsw     %%xmm4,%%xmm2                                   \n"            \
-    "psraw      $0x6,%%xmm0                                     \n"            \
-    "psraw      $0x6,%%xmm1                                     \n"            \
-    "psraw      $0x6,%%xmm2                                     \n"            \
-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
-    "packuswb   %%xmm2,%%xmm2                                   \n"
+#define YUVTORGB16(yuvconstants)                                  \
+  "movdqa     %%xmm0,%%xmm1                                   \n" \
+  "movdqa     %%xmm0,%%xmm2                                   \n" \
+  "movdqa     %%xmm0,%%xmm3                                   \n" \
+  "movdqa     %%xmm11,%%xmm0                                  \n" \
+  "pmaddubsw  %%xmm8,%%xmm1                                   \n" \
+  "psubw      %%xmm1,%%xmm0                                   \n" \
+  "movdqa     %%xmm12,%%xmm1                                  \n" \
+  "pmaddubsw  %%xmm9,%%xmm2                                   \n" \
+  "psubw      %%xmm2,%%xmm1                                   \n" \
+  "movdqa     %%xmm13,%%xmm2                                  \n" \
+  "pmaddubsw  %%xmm10,%%xmm3                                  \n" \
+  "psubw      %%xmm3,%%xmm2                                   \n" \
+  "pmulhuw    %%xmm14,%%xmm4                                  \n" \
+  "paddsw     %%xmm4,%%xmm0                                   \n" \
+  "paddsw     %%xmm4,%%xmm1                                   \n" \
+  "paddsw     %%xmm4,%%xmm2                                   \n"
 #define YUVTORGB_REGS \
-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
 
 #else
 #define YUVTORGB_SETUP(yuvconstants)
 // Convert 8 pixels: 8 UV and 8 Y
-#define YUVTORGB(yuvconstants)                                                 \
-    "movdqa     %%xmm0,%%xmm1                                   \n"            \
-    "movdqa     %%xmm0,%%xmm2                                   \n"            \
-    "movdqa     %%xmm0,%%xmm3                                   \n"            \
-    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \
-    "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"            \
-    "psubw      %%xmm1,%%xmm0                                   \n"            \
-    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"            \
-    "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"            \
-    "psubw      %%xmm2,%%xmm1                                   \n"            \
-    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \
-    "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \
-    "psubw      %%xmm3,%%xmm2                                   \n"            \
-    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \
-    "paddsw     %%xmm4,%%xmm0                                   \n"            \
-    "paddsw     %%xmm4,%%xmm1                                   \n"            \
-    "paddsw     %%xmm4,%%xmm2                                   \n"            \
-    "psraw      $0x6,%%xmm0                                     \n"            \
-    "psraw      $0x6,%%xmm1                                     \n"            \
-    "psraw      $0x6,%%xmm2                                     \n"            \
-    "packuswb   %%xmm0,%%xmm0                                   \n"            \
-    "packuswb   %%xmm1,%%xmm1                                   \n"            \
-    "packuswb   %%xmm2,%%xmm2                                   \n"
+#define YUVTORGB16(yuvconstants)                                  \
+  "movdqa     %%xmm0,%%xmm1                                   \n" \
+  "movdqa     %%xmm0,%%xmm2                                   \n" \
+  "movdqa     %%xmm0,%%xmm3                                   \n" \
+  "movdqa     96(%[yuvconstants]),%%xmm0                      \n" \
+  "pmaddubsw  (%[yuvconstants]),%%xmm1                        \n" \
+  "psubw      %%xmm1,%%xmm0                                   \n" \
+  "movdqa     128(%[yuvconstants]),%%xmm1                     \n" \
+  "pmaddubsw  32(%[yuvconstants]),%%xmm2                      \n" \
+  "psubw      %%xmm2,%%xmm1                                   \n" \
+  "movdqa     160(%[yuvconstants]),%%xmm2                     \n" \
+  "pmaddubsw  64(%[yuvconstants]),%%xmm3                      \n" \
+  "psubw      %%xmm3,%%xmm2                                   \n" \
+  "pmulhuw    192(%[yuvconstants]),%%xmm4                     \n" \
+  "paddsw     %%xmm4,%%xmm0                                   \n" \
+  "paddsw     %%xmm4,%%xmm1                                   \n" \
+  "paddsw     %%xmm4,%%xmm2                                   \n"
 #define YUVTORGB_REGS
 #endif
 
+#define YUVTORGB(yuvconstants)                                    \
+  YUVTORGB16(yuvconstants)                                        \
+  "psraw      $0x6,%%xmm0                                     \n" \
+  "psraw      $0x6,%%xmm1                                     \n" \
+  "psraw      $0x6,%%xmm2                                     \n" \
+  "packuswb   %%xmm0,%%xmm0                                   \n" \
+  "packuswb   %%xmm1,%%xmm1                                   \n" \
+  "packuswb   %%xmm2,%%xmm2                                   \n"
+
 // Store 8 ARGB values.
-#define STOREARGB                                                              \
-    "punpcklbw  %%xmm1,%%xmm0                                    \n"           \
-    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
-    "movdqa     %%xmm0,%%xmm1                                    \n"           \
-    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
-    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
-    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
-    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
-    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
+#define STOREARGB                                                  \
+  "punpcklbw  %%xmm1,%%xmm0                                    \n" \
+  "punpcklbw  %%xmm5,%%xmm2                                    \n" \
+  "movdqa     %%xmm0,%%xmm1                                    \n" \
+  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
+  "punpckhwd  %%xmm2,%%xmm1                                    \n" \
+  "movdqu     %%xmm0,(%[dst_argb])                             \n" \
+  "movdqu     %%xmm1,0x10(%[dst_argb])                         \n" \
+  "lea        0x20(%[dst_argb]), %[dst_argb]                   \n"
 
 // Store 8 RGBA values.
-#define STORERGBA                                                              \
-    "pcmpeqb   %%xmm5,%%xmm5                                     \n"           \
-    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
-    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
-    "movdqa    %%xmm5,%%xmm0                                     \n"           \
-    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
-    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
-    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
-    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
-    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
+#define STORERGBA                                                  \
+  "pcmpeqb   %%xmm5,%%xmm5                                     \n" \
+  "punpcklbw %%xmm2,%%xmm1                                     \n" \
+  "punpcklbw %%xmm0,%%xmm5                                     \n" \
+  "movdqa    %%xmm5,%%xmm0                                     \n" \
+  "punpcklwd %%xmm1,%%xmm5                                     \n" \
+  "punpckhwd %%xmm1,%%xmm0                                     \n" \
+  "movdqu    %%xmm5,(%[dst_rgba])                              \n" \
+  "movdqu    %%xmm0,0x10(%[dst_rgba])                          \n" \
+  "lea       0x20(%[dst_rgba]),%[dst_rgba]                     \n"
 
-void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
+// Store 8 AR30 values.
+#define STOREAR30                                                  \
+  "psraw      $0x4,%%xmm0                                      \n" \
+  "psraw      $0x4,%%xmm1                                      \n" \
+  "psraw      $0x4,%%xmm2                                      \n" \
+  "pminsw     %%xmm7,%%xmm0                                    \n" \
+  "pminsw     %%xmm7,%%xmm1                                    \n" \
+  "pminsw     %%xmm7,%%xmm2                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm0                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm1                                    \n" \
+  "pmaxsw     %%xmm6,%%xmm2                                    \n" \
+  "psllw      $0x4,%%xmm2                                      \n" \
+  "movdqa     %%xmm0,%%xmm3                                    \n" \
+  "punpcklwd  %%xmm2,%%xmm0                                    \n" \
+  "punpckhwd  %%xmm2,%%xmm3                                    \n" \
+  "movdqa     %%xmm1,%%xmm2                                    \n" \
+  "punpcklwd  %%xmm5,%%xmm1                                    \n" \
+  "punpckhwd  %%xmm5,%%xmm2                                    \n" \
+  "pslld      $0xa,%%xmm1                                      \n" \
+  "pslld      $0xa,%%xmm2                                      \n" \
+  "por        %%xmm1,%%xmm0                                    \n" \
+  "por        %%xmm2,%%xmm3                                    \n" \
+  "movdqu     %%xmm0,(%[dst_ar30])                             \n" \
+  "movdqu     %%xmm3,0x10(%[dst_ar30])                         \n" \
+  "lea        0x20(%[dst_ar30]), %[dst_ar30]                   \n"
+
+void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
@@ -1678,8 +2014,9 @@
     YUVTORGB_SETUP(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV444
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1691,15 +2028,15 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+  : "memory", "cc", YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
-void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
-                                 const uint8* u_buf,
-                                 const uint8* v_buf,
-                                 uint8* dst_rgb24,
+void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf,
+                                 const uint8_t* u_buf,
+                                 const uint8_t* v_buf,
+                                 uint8_t* dst_rgb24,
                                  const struct YuvConstants* yuvconstants,
                                  int width) {
   asm volatile (
@@ -1707,8 +2044,9 @@
     "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
     "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
     "sub       %[u_buf],%[v_buf]               \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422
     YUVTORGB(yuvconstants)
     "punpcklbw %%xmm1,%%xmm0                   \n"
@@ -1719,9 +2057,9 @@
     "pshufb    %%xmm5,%%xmm0                   \n"
     "pshufb    %%xmm6,%%xmm1                   \n"
     "palignr   $0xc,%%xmm0,%%xmm1              \n"
-    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
-    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
-    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
+    "movq      %%xmm0,(%[dst_rgb24])           \n"
+    "movdqu    %%xmm1,0x8(%[dst_rgb24])        \n"
+    "lea       0x18(%[dst_rgb24]),%[dst_rgb24] \n"
     "subl      $0x8,%[width]                   \n"
     "jg        1b                              \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
@@ -1728,7 +2066,7 @@
     [u_buf]"+r"(u_buf),    // %[u_buf]
     [v_buf]"+r"(v_buf),    // %[v_buf]
     [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
-#if defined(__i386__) && defined(__pic__)
+#if defined(__i386__)
     [width]"+m"(width)     // %[width]
 #else
     [width]"+rm"(width)    // %[width]
@@ -1736,15 +2074,15 @@
   : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
     [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
     [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+  : "memory", "cc", YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
   );
 }
 
-void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_argb,
+void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
@@ -1751,8 +2089,9 @@
     YUVTORGB_SETUP(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1764,21 +2103,160 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+  : "memory", "cc", YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 
-void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* uv_buf,
-                                uint8* dst_argb,
+void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_ar30,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // AR30 constants
+    "psrlw     $14,%%xmm5                      \n"
+    "psllw     $4,%%xmm5                       \n"  // 2 alpha bits
+    "pxor      %%xmm6,%%xmm6                   \n"
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // 0 for min
+    "psrlw     $6,%%xmm7                       \n"  // 1023 for max
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUV422
+    YUVTORGB16(yuvconstants)
+    STOREAR30
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+// 10 bit YUV to ARGB
+void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf,
+                                const uint16_t* u_buf,
+                                const uint16_t* v_buf,
+                                uint8_t* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
+    READYUV210
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+
+// 10 bit YUV to AR30
+void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf,
+                                const uint16_t* u_buf,
+                                const uint16_t* v_buf,
+                                uint8_t* dst_ar30,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $14,%%xmm5                      \n"
+    "psllw     $4,%%xmm5                       \n"  // 2 alpha bits
+    "pxor      %%xmm6,%%xmm6                   \n"
+    "pcmpeqb   %%xmm7,%%xmm7                   \n"  // 0 for min
+    "psrlw     $6,%%xmm7                       \n"  // 1023 for max
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUV210
+    YUVTORGB16(yuvconstants)
+    STOREAR30
+    "sub       $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+
+#ifdef HAS_I422ALPHATOARGBROW_SSSE3
+void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+                                     const uint8_t* u_buf,
+                                     const uint8_t* v_buf,
+                                     const uint8_t* a_buf,
+                                     uint8_t* dst_argb,
+                                     const struct YuvConstants* yuvconstants,
+                                     int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUVA422
+    YUVTORGB(yuvconstants)
+    STOREARGB
+    "subl      $0x8,%[width]                   \n"
+    "jg        1b                              \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [a_buf]"+r"(a_buf),    // %[a_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+#if defined(__i386__)
+    [width]"+m"(width)     // %[width]
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+#endif  // HAS_I422ALPHATOARGBROW_SSSE3
+
+void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* uv_buf,
+                                uint8_t* dst_argb,
+                                const struct YuvConstants* yuvconstants,
+                                int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP(yuvconstants)
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
+    LABELALIGN
+    "1:                                        \n"
     READNV12
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1789,21 +2267,24 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                                const uint8* vu_buf,
-                                uint8* dst_argb,
+void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* vu_buf,
+                                uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READNV21
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1815,20 +2296,23 @@
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleNV21]"m"(kShuffleNV21)
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
-                                uint8* dst_argb,
+void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf,
+                                uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUY2
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1840,20 +2324,23 @@
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
-                                uint8* dst_argb,
+void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf,
+                                uint8_t* dst_argb,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP(yuvconstants)
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READUYVY
     YUVTORGB(yuvconstants)
     STOREARGB
@@ -1865,15 +2352,16 @@
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleUYVYY]"m"(kShuffleUYVYY),
     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
-    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 
-void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
-                                const uint8* u_buf,
-                                const uint8* v_buf,
-                                uint8* dst_rgba,
+void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
+                                const uint8_t* u_buf,
+                                const uint8_t* v_buf,
+                                uint8_t* dst_rgba,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
   asm volatile (
@@ -1880,8 +2368,9 @@
     YUVTORGB_SETUP(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422
     YUVTORGB(yuvconstants)
     STORERGBA
@@ -1893,7 +2382,7 @@
     [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS
+  : "memory", "cc", YUVTORGB_REGS
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
@@ -1901,171 +2390,202 @@
 #endif  // HAS_I422TOARGBROW_SSSE3
 
 // Read 16 UV from 444
-#define READYUV444_AVX2                                                        \
-    "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
-    "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+#define READYUV444_AVX2                                               \
+  "vmovdqu    (%[u_buf]),%%xmm0                                   \n" \
+  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
+  "lea        0x10(%[u_buf]),%[u_buf]                             \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
 
 // Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2                                                        \
-    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+#define READYUV422_AVX2                                               \
+  "vmovq      (%[u_buf]),%%xmm0                                   \n" \
+  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
 
+// Read 8 UV from I210 (10 bit), upsample to 16 UV
+// TODO(fbarchard): Consider vshufb to replace pack/unpack
+// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
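+// UV is shifted right by 2 (10 bit down to 8 bit chroma for the 8 bit
+// YUVTORGB path); Y is shifted left by 6 so the 10 bit value is msb
+// aligned in a 16 bit word for vpmulhuw.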
+#define READYUV210_AVX2                                            \
+  "vmovdqu    (%[u_buf]),%%xmm0                                \n" \
+  "vmovdqu    0x00(%[u_buf],%[v_buf],1),%%xmm1                 \n" \
+  "lea        0x10(%[u_buf]),%[u_buf]                          \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                              \n" \
+  "vpermq     $0xd8,%%ymm1,%%ymm1                              \n" \
+  "vpunpcklwd %%ymm1,%%ymm0,%%ymm0                             \n" \
+  "vpsraw     $0x2,%%ymm0,%%ymm0                               \n" \
+  "vpackuswb  %%ymm0,%%ymm0,%%ymm0                             \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                             \n" \
+  "vmovdqu    (%[y_buf]),%%ymm4                                \n" \
+  "vpsllw     $0x6,%%ymm4,%%ymm4                               \n" \
+  "lea        0x20(%[y_buf]),%[y_buf]                          \n"
+
 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
-#define READYUVA422_AVX2                                                       \
-    "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
-    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \
-    "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \
-    "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
-    "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"
+#define READYUVA422_AVX2                                              \
+  "vmovq      (%[u_buf]),%%xmm0                                   \n" \
+  "vmovq      0x00(%[u_buf],%[v_buf],1),%%xmm1                    \n" \
+  "lea        0x8(%[u_buf]),%[u_buf]                              \n" \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n" \
+  "vmovdqu    (%[a_buf]),%%xmm5                                   \n" \
+  "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n" \
+  "lea        0x10(%[a_buf]),%[a_buf]                             \n"
 
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2                                                        \
-    "vmovd      " MEMACCESS([u_buf]) ",%%xmm0                       \n"        \
-    MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
-    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]                   \n"        \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpckldq %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
-
 // Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                      \n"        \
-    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+#define READNV12_AVX2                                                 \
+  "vmovdqu    (%[uv_buf]),%%xmm0                                  \n" \
+  "lea        0x10(%[uv_buf]),%[uv_buf]                           \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
 
 // Read 8 VU from NV21, upsample to 16 UV.
-#define READNV21_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                      \n"        \
-    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
-    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
-    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
-    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
-    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
+#define READNV21_AVX2                                                 \
+  "vmovdqu    (%[vu_buf]),%%xmm0                                  \n" \
+  "lea        0x10(%[vu_buf]),%[vu_buf]                           \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n" \
+  "vmovdqu    (%[y_buf]),%%xmm4                                   \n" \
+  "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n" \
+  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n" \
+  "lea        0x10(%[y_buf]),%[y_buf]                             \n"
 
 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2                                                          \
-    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                    \n"        \
-    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
-    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
-    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
-    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"
+#define READYUY2_AVX2                                                 \
+  "vmovdqu    (%[yuy2_buf]),%%ymm4                                \n" \
+  "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n" \
+  "vmovdqu    (%[yuy2_buf]),%%ymm0                                \n" \
+  "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n" \
+  "lea        0x20(%[yuy2_buf]),%[yuy2_buf]                       \n"
 
 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2                                                          \
-    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                   \n"        \
-    "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
-    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
-    "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
-    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
+#define READUYVY_AVX2                                                 \
+  "vmovdqu    (%[uyvy_buf]),%%ymm4                                \n" \
+  "vpshufb    %[kShuffleUYVYY], %%ymm4, %%ymm4                    \n" \
+  "vmovdqu    (%[uyvy_buf]),%%ymm0                                \n" \
+  "vpshufb    %[kShuffleUYVYUV], %%ymm0, %%ymm0                   \n" \
+  "lea        0x20(%[uyvy_buf]),%[uyvy_buf]                       \n"
 
 #if defined(__x86_64__)
-#define YUVTORGB_SETUP_AVX2(yuvconstants)                                      \
-    "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8            \n"           \
-    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
-    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
-    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
-    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
-    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
-    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"
-#define YUVTORGB_AVX2(yuvconstants)                                            \
-    "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n"        \
-    "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n"        \
-    "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n"        \
-    "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n"        \
-    "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n"        \
-    "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n"        \
-    "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n"        \
-    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
-    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
-    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
-    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
-    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
-    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
-    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+#define YUVTORGB_SETUP_AVX2(yuvconstants)                            \
+  "vmovdqa     (%[yuvconstants]),%%ymm8                          \n" \
+  "vmovdqa     32(%[yuvconstants]),%%ymm9                        \n" \
+  "vmovdqa     64(%[yuvconstants]),%%ymm10                       \n" \
+  "vmovdqa     96(%[yuvconstants]),%%ymm11                       \n" \
+  "vmovdqa     128(%[yuvconstants]),%%ymm12                      \n" \
+  "vmovdqa     160(%[yuvconstants]),%%ymm13                      \n" \
+  "vmovdqa     192(%[yuvconstants]),%%ymm14                      \n"
+
+#define YUVTORGB16_AVX2(yuvconstants)                                 \
+  "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n" \
+  "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n" \
+  "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n" \
+  "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n" \
+  "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n" \
+  "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n" \
+  "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n" \
+  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
+  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
+  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
+
 #define YUVTORGB_REGS_AVX2 \
-    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
+
 #else  // Convert 16 pixels: 16 UV and 16 Y.
+
 #define YUVTORGB_SETUP_AVX2(yuvconstants)
-#define YUVTORGB_AVX2(yuvconstants)                                            \
-    "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2   \n"        \
-    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
-    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
-    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
-    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
-    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \
-    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
-    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \
-    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
-    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \
-    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
-    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
-    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
-    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
-    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
-    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
-    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
-    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+#define YUVTORGB16_AVX2(yuvconstants)                                 \
+  "vpmaddubsw  64(%[yuvconstants]),%%ymm0,%%ymm2                  \n" \
+  "vpmaddubsw  32(%[yuvconstants]),%%ymm0,%%ymm1                  \n" \
+  "vpmaddubsw  (%[yuvconstants]),%%ymm0,%%ymm0                    \n" \
+  "vmovdqu     160(%[yuvconstants]),%%ymm3                        \n" \
+  "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n" \
+  "vmovdqu     128(%[yuvconstants]),%%ymm3                        \n" \
+  "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n" \
+  "vmovdqu     96(%[yuvconstants]),%%ymm3                         \n" \
+  "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n" \
+  "vpmulhuw    192(%[yuvconstants]),%%ymm4,%%ymm4                 \n" \
+  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
+  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
+  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"
 #define YUVTORGB_REGS_AVX2
 #endif
 
+#define YUVTORGB_AVX2(yuvconstants)                                   \
+  YUVTORGB16_AVX2(yuvconstants)                                       \
+  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
+  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
+  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
+  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
+  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
+  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
+
 // Store 16 ARGB values.
-#define STOREARGB_AVX2                                                         \
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
-    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
-    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
-    "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
-    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \
-    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \
-    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \
-    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \
-    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"
+#define STOREARGB_AVX2                                                \
+  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n" \
+  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
+  "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n" \
+  "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n" \
+  "vmovdqu    %%ymm1,(%[dst_argb])                                \n" \
+  "vmovdqu    %%ymm0,0x20(%[dst_argb])                            \n" \
+  "lea       0x40(%[dst_argb]), %[dst_argb]                       \n"
 
+// Store 16 AR30 values.
+#define STOREAR30_AVX2                                                \
+  "vpsraw     $0x4,%%ymm0,%%ymm0                                  \n" \
+  "vpsraw     $0x4,%%ymm1,%%ymm1                                  \n" \
+  "vpsraw     $0x4,%%ymm2,%%ymm2                                  \n" \
+  "vpminsw    %%ymm7,%%ymm0,%%ymm0                                \n" \
+  "vpminsw    %%ymm7,%%ymm1,%%ymm1                                \n" \
+  "vpminsw    %%ymm7,%%ymm2,%%ymm2                                \n" \
+  "vpmaxsw    %%ymm6,%%ymm0,%%ymm0                                \n" \
+  "vpmaxsw    %%ymm6,%%ymm1,%%ymm1                                \n" \
+  "vpmaxsw    %%ymm6,%%ymm2,%%ymm2                                \n" \
+  "vpsllw     $0x4,%%ymm2,%%ymm2                                  \n" \
+  "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n" \
+  "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n" \
+  "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n" \
+  "vpunpckhwd %%ymm2,%%ymm0,%%ymm3                                \n" \
+  "vpunpcklwd %%ymm2,%%ymm0,%%ymm0                                \n" \
+  "vpunpckhwd %%ymm5,%%ymm1,%%ymm2                                \n" \
+  "vpunpcklwd %%ymm5,%%ymm1,%%ymm1                                \n" \
+  "vpslld     $0xa,%%ymm1,%%ymm1                                  \n" \
+  "vpslld     $0xa,%%ymm2,%%ymm2                                  \n" \
+  "vpor       %%ymm1,%%ymm0,%%ymm0                                \n" \
+  "vpor       %%ymm2,%%ymm3,%%ymm3                                \n" \
+  "vmovdqu    %%ymm0,(%[dst_ar30])                                \n" \
+  "vmovdqu    %%ymm3,0x20(%[dst_ar30])                            \n" \
+  "lea        0x40(%[dst_ar30]), %[dst_ar30]                      \n"
+
 #ifdef HAS_I444TOARGBROW_AVX2
 // 16 pixels
 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
+void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
@@ -2072,8 +2592,9 @@
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV444_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2086,19 +2607,19 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+  : "memory", "cc", YUVTORGB_REGS_AVX2
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
 #endif  // HAS_I444TOARGBROW_AVX2
 
-#ifdef HAS_I411TOARGBROW_AVX2
+#if defined(HAS_I422TOARGBROW_AVX2)
 // 16 pixels
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
@@ -2105,13 +2626,15 @@
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+
     LABELALIGN
-  "1:                                          \n"
-    READYUV411_AVX2
+    "1:                                        \n"
+    READYUV422_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
     "sub       $0x10,%[width]                  \n"
     "jg        1b                              \n"
+
     "vzeroupper                                \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
@@ -2119,19 +2642,59 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+  : "memory", "cc", YUVTORGB_REGS_AVX2
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
-#endif  // HAS_I411TOARGBROW_AVX2
+#endif  // HAS_I422TOARGBROW_AVX2
 
-#if defined(HAS_I422TOARGBROW_AVX2)
+#if defined(HAS_I422TOAR30ROW_AVX2)
 // 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_ar30,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // AR30 constants
+    "vpsrlw    $14,%%ymm5,%%ymm5               \n"
+    "vpsllw    $4,%%ymm5,%%ymm5                \n"  // 2 alpha bits
+    "vpxor     %%ymm6,%%ymm6,%%ymm6            \n"  // 0 for min
+    "vpcmpeqb  %%ymm7,%%ymm7,%%ymm7            \n"  // 1023 for max
+    "vpsrlw    $6,%%ymm7,%%ymm7                \n"
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUV422_AVX2
+    YUVTORGB16_AVX2(yuvconstants)
+    STOREAR30_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+  );
+}
+#endif  // HAS_I422TOAR30ROW_AVX2
+
+#if defined(HAS_I210TOARGBROW_AVX2)
+// 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
+void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf,
+                               const uint16_t* u_buf,
+                               const uint16_t* v_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
@@ -2138,13 +2701,15 @@
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+
     LABELALIGN
-  "1:                                          \n"
-    READYUV422_AVX2
+    "1:                                        \n"
+    READYUV210_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
     "sub       $0x10,%[width]                  \n"
     "jg        1b                              \n"
+
     "vzeroupper                                \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
@@ -2152,19 +2717,100 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+  : "memory", "cc", YUVTORGB_REGS_AVX2
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
-#endif  // HAS_I422TOARGBROW_AVX2
+#endif  // HAS_I210TOARGBROW_AVX2
 
+#if defined(HAS_I210TOAR30ROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes).
+void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf,
+                               const uint16_t* u_buf,
+                               const uint16_t* v_buf,
+                               uint8_t* dst_ar30,
+                               const struct YuvConstants* yuvconstants,
+                               int width) {
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // AR30 constants
+    "vpsrlw    $14,%%ymm5,%%ymm5               \n"
+    "vpsllw    $4,%%ymm5,%%ymm5                \n"  // 2 alpha bits
+    "vpxor     %%ymm6,%%ymm6,%%ymm6            \n"  // 0 for min
+    "vpcmpeqb  %%ymm7,%%ymm7,%%ymm7            \n"  // 1023 for max
+    "vpsrlw    $6,%%ymm7,%%ymm7                \n"
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUV210_AVX2
+    YUVTORGB16_AVX2(yuvconstants)
+    STOREAR30_AVX2
+    "sub       $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_ar30]"+r"(dst_ar30),  // %[dst_ar30]
+    [width]"+rm"(width)    // %[width]
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+}
+#endif  // HAS_I210TOAR30ROW_AVX2
+
+#if defined(HAS_I422ALPHATOARGBROW_AVX2)
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
+void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf,
+                                    const uint8_t* u_buf,
+                                    const uint8_t* v_buf,
+                                    const uint8_t* a_buf,
+                                    uint8_t* dst_argb,
+                                    const struct YuvConstants* yuvconstants,
+                                    int width) {
+  // clang-format off
+  asm volatile (
+    YUVTORGB_SETUP_AVX2(yuvconstants)
+    "sub       %[u_buf],%[v_buf]               \n"
+
+    LABELALIGN
+    "1:                                        \n"
+    READYUVA422_AVX2
+    YUVTORGB_AVX2(yuvconstants)
+    STOREARGB_AVX2
+    "subl      $0x10,%[width]                  \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [a_buf]"+r"(a_buf),    // %[a_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+#if defined(__i386__)
+    [width]"+m"(width)     // %[width]
+#else
+    [width]"+rm"(width)    // %[width]
+#endif
+  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
+  : "memory", "cc", YUVTORGB_REGS_AVX2
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+  );
+  // clang-format on
+}
+#endif  // HAS_I422ALPHATOARGBROW_AVX2
+
 #if defined(HAS_I422TORGBAROW_AVX2)
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
-                               const uint8* u_buf,
-                               const uint8* v_buf,
-                               uint8* dst_argb,
+void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* u_buf,
+                               const uint8_t* v_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
   asm volatile (
@@ -2171,8 +2817,9 @@
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "sub       %[u_buf],%[v_buf]               \n"
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUV422_AVX2
     YUVTORGB_AVX2(yuvconstants)
 
@@ -2183,11 +2830,11 @@
     "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
     "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
     "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
-    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
-    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
-    "sub       $0x10,%[width]                  \n"
-    "jg        1b                              \n"
+    "vmovdqu    %%ymm0,(%[dst_argb])           \n"
+    "vmovdqu    %%ymm1,0x20(%[dst_argb])       \n"
+    "lea        0x40(%[dst_argb]),%[dst_argb]  \n"
+    "sub        $0x10,%[width]                 \n"
+    "jg         1b                             \n"
     "vzeroupper                                \n"
   : [y_buf]"+r"(y_buf),    // %[y_buf]
     [u_buf]"+r"(u_buf),    // %[u_buf]
@@ -2195,7 +2842,7 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
+  : "memory", "cc", YUVTORGB_REGS_AVX2
     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
 }
@@ -2204,16 +2851,18 @@
 #if defined(HAS_NV12TOARGBROW_AVX2)
 // 16 pixels.
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* uv_buf,
-                               uint8* dst_argb,
+void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* uv_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READNV12_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2225,9 +2874,10 @@
     [dst_argb]"+r"(dst_argb),  // %[dst_argb]
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS_AVX2
     "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_NV12TOARGBROW_AVX2
 
@@ -2234,16 +2884,18 @@
 #if defined(HAS_NV21TOARGBROW_AVX2)
 // 16 pixels.
 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
-                               const uint8* vu_buf,
-                               uint8* dst_argb,
+void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf,
+                               const uint8_t* vu_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READNV21_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2256,9 +2908,10 @@
     [width]"+rm"(width)    // %[width]
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleNV21]"m"(kShuffleNV21)
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS_AVX2
       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_NV21TOARGBROW_AVX2
 
@@ -2265,15 +2918,17 @@
 #if defined(HAS_YUY2TOARGBROW_AVX2)
 // 16 pixels.
 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
-                               uint8* dst_argb,
+void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READYUY2_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2286,9 +2941,10 @@
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
     [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS_AVX2
       "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_YUY2TOARGBROW_AVX2
 
@@ -2295,15 +2951,17 @@
 #if defined(HAS_UYVYTOARGBROW_AVX2)
 // 16 pixels.
 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
-                               uint8* dst_argb,
+void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf,
+                               uint8_t* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
+  // clang-format off
   asm volatile (
     YUVTORGB_SETUP_AVX2(yuvconstants)
     "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+
     LABELALIGN
-  "1:                                          \n"
+    "1:                                        \n"
     READUYVY_AVX2
     YUVTORGB_AVX2(yuvconstants)
     STOREARGB_AVX2
@@ -2316,54 +2974,55 @@
   : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
     [kShuffleUYVYY]"m"(kShuffleUYVYY),
     [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
-    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
+    : "memory", "cc", YUVTORGB_REGS_AVX2
       "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
   );
+  // clang-format on
 }
 #endif  // HAS_UYVYTOARGBROW_AVX2
 
 #ifdef HAS_I400TOARGBROW_SSE2
-void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
-  asm volatile (
-    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
-    "movd      %%eax,%%xmm2                    \n"
-    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
-    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
-    "movd      %%eax,%%xmm3                    \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    LABELALIGN
-  "1:                                          \n"
-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "psubusw   %%xmm3,%%xmm0                   \n"
-    "psrlw     $6, %%xmm0                      \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
+void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
+      "movd      %%eax,%%xmm2                    \n"
+      "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+      "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 *
+                                                      // 16
+      "movd      %%eax,%%xmm3                    \n"
+      "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "pslld     $0x18,%%xmm4                    \n"
 
-    // Step 2: Weave into ARGB
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm0,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm1                   \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "por       %%xmm4,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
+      LABELALIGN
+      "1:                                        \n"
+      // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+      "movq      (%0),%%xmm0                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "pmulhuw   %%xmm2,%%xmm0                   \n"
+      "psubusw   %%xmm3,%%xmm0                   \n"
+      "psrlw     $6, %%xmm0                      \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
 
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(y_buf),     // %0
-    "+r"(dst_argb),  // %1
-    "+rm"(width)     // %2
-  :
-  : "memory", "cc", "eax"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+      // Step 2: Weave into ARGB
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklwd %%xmm0,%%xmm0                   \n"
+      "punpckhwd %%xmm1,%%xmm1                   \n"
+      "por       %%xmm4,%%xmm0                   \n"
+      "por       %%xmm4,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(y_buf),     // %0
+        "+r"(dst_argb),  // %1
+        "+rm"(width)     // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 #endif  // HAS_I400TOARGBROW_SSE2
 
@@ -2370,1077 +3029,1548 @@
 #ifdef HAS_I400TOARGBROW_AVX2
 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
 // note: vpunpcklbw mutates and vpackuswb unmutates.
-void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
-  asm volatile (
-    "mov        $0x4a354a35,%%eax              \n" // 0488 = 1160 = 1.164 * 16
-    "vmovd      %%eax,%%xmm2                   \n"
-    "vbroadcastss %%xmm2,%%ymm2                \n"
-    "mov        $0x4880488,%%eax               \n" // 4a35 = 18997 = 1.164
-    "vmovd      %%eax,%%xmm3                   \n"
-    "vbroadcastss %%xmm3,%%ymm3                \n"
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpslld     $0x18,%%ymm4,%%ymm4            \n"
+void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "mov        $0x4a354a35,%%eax              \n"  // 0488 = 1160 = 1.164 *
+                                                      // 16
+      "vmovd      %%eax,%%xmm2                   \n"
+      "vbroadcastss %%xmm2,%%ymm2                \n"
+      "mov        $0x4880488,%%eax               \n"  // 4a35 = 18997 = 1.164
+      "vmovd      %%eax,%%xmm3                   \n"
+      "vbroadcastss %%xmm3,%%ymm3                \n"
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpslld     $0x18,%%ymm4,%%ymm4            \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
-    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
-    "lea        " MEMLEA(0x10,0) ",%0          \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
-    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
-    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
-    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
-    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub        $0x10,%2                       \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(y_buf),     // %0
-    "+r"(dst_argb),  // %1
-    "+rm"(width)     // %2
-  :
-  : "memory", "cc", "eax"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+      "vmovdqu    (%0),%%xmm0                    \n"
+      "lea        0x10(%0),%0                    \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
+      "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
+      "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
+      "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
+      "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
+      "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub        $0x10,%2                       \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(y_buf),     // %0
+        "+r"(dst_argb),  // %1
+        "+rm"(width)     // %2
+      :
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 #endif  // HAS_I400TOARGBROW_AVX2
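
For reference, the two constants above implement G = (y - 16) * 1.164 in
fixed point. A minimal scalar model of one luma sample (illustrative only;
the helper below is not part of libyuv, and uses <stdint.h> types):

    // punpcklbw y,y widens y to y * 0x0101; pmulhuw keeps the high 16 bits
    // of the product; the 0x0488 bias and the >> 6 undo the scaling.
    static inline uint8_t I400ToGray(uint8_t y) {
      uint32_t g = ((uint32_t)y * 0x0101 * 0x4a35) >> 16;  // ~ y * 1.164 * 64
      g = g > 0x0488 ? g - 0x0488 : 0;    // psubusw: saturating -16 offset
      g >>= 6;                            // psrlw $0x6
      return g > 255 ? 255 : (uint8_t)g;  // packuswb clamp
    }
    // Each output pixel is then 0xFF000000 | g << 16 | g << 8 | g.
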
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
-static uvec8 kShuffleMirror = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
 
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "movdqa    %3,%%xmm5                       \n"
-    LABELALIGN
-  "1:                                          \n"
-    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kShuffleMirror) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
+  asm volatile(
+
+      "movdqa    %3,%%xmm5                       \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    -0x10(%0,%2,1),%%xmm0           \n"
+      "pshufb    %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src),           // %0
+        "+r"(dst),           // %1
+        "+r"(temp_width)     // %2
+      : "m"(kShuffleMirror)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
 }
 #endif  // HAS_MIRRORROW_SSSE3
 
 #ifdef HAS_MIRRORROW_AVX2
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm5                  \n"
-    LABELALIGN
-  "1:                                          \n"
-    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
-    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kShuffleMirror) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
+  asm volatile(
+
+      "vbroadcastf128 %3,%%ymm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    -0x20(%0,%2,1),%%ymm0          \n"
+      "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src),           // %0
+        "+r"(dst),           // %1
+        "+r"(temp_width)     // %2
+      : "m"(kShuffleMirror)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
 }
 #endif  // HAS_MIRRORROW_AVX2
 
 #ifdef HAS_MIRRORUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
-static uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+void MirrorUVRow_SSSE3(const uint8_t* src,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
                        int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "movdqa    %4,%%xmm1                       \n"
-    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
-    "pshufb    %%xmm1,%%xmm0                   \n"
-    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $8,%3                           \n"
-    "jg        1b                              \n"
-  : "+r"(src),      // %0
-    "+r"(dst_u),    // %1
-    "+r"(dst_v),    // %2
-    "+r"(temp_width)  // %3
-  : "m"(kShuffleMirrorUV)  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
+  asm volatile(
+      "movdqa    %4,%%xmm1                       \n"
+      "lea       -0x10(%0,%3,2),%0               \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "lea       -0x10(%0),%0                    \n"
+      "pshufb    %%xmm1,%%xmm0                   \n"
+      "movlpd    %%xmm0,(%1)                     \n"
+      "movhpd    %%xmm0,0x00(%1,%2,1)            \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $8,%3                           \n"
+      "jg        1b                              \n"
+      : "+r"(src),             // %0
+        "+r"(dst_u),           // %1
+        "+r"(dst_v),           // %2
+        "+r"(temp_width)       // %3
+      : "m"(kShuffleMirrorUV)  // %4
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_MIRRORUVROW_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_SSE2
 
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
-    "lea       " MEMLEA(-0x10,0) ",%0          \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
+  asm volatile(
+
+      "lea       -0x10(%0,%2,4),%0               \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
+      "lea       -0x10(%0),%0                    \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),        // %0
+        "+r"(dst),        // %1
+        "+r"(temp_width)  // %2
+      :
+      : "memory", "cc", "xmm0");
 }
 #endif  // HAS_ARGBMIRRORROW_SSE2
 
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
-  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
+void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
   intptr_t temp_width = (intptr_t)(width);
-  asm volatile (
-    "vmovdqu    %3,%%ymm5                      \n"
-    LABELALIGN
-  "1:                                          \n"
-    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x8,%2                        \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src),  // %0
-    "+r"(dst),  // %1
-    "+r"(temp_width)  // %2
-  : "m"(kARGBShuffleMirror_AVX2) // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm5"
-  );
+  asm volatile(
+
+      "vmovdqu    %3,%%ymm5                      \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vpermd    -0x20(%0,%2,4),%%ymm5,%%ymm0    \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x8,%2                        \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src),                    // %0
+        "+r"(dst),                    // %1
+        "+r"(temp_width)              // %2
+      : "m"(kARGBShuffleMirror_AVX2)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
 }
 #endif  // HAS_ARGBMIRRORROW_AVX2
 
 #ifdef HAS_SPLITUVROW_AVX2
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_AVX2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5             \n"
-    "vpsrlw     $0x8,%%ymm5,%%ymm5               \n"
-    "sub        %1,%2                            \n"
-    LABELALIGN
-  "1:                                            \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0          \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1    \n"
-    "lea        " MEMLEA(0x40,0) ",%0            \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm2               \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm3               \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0             \n"
-    "vpand      %%ymm5,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm3,%%ymm2,%%ymm2             \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0              \n"
-    "vpermq     $0xd8,%%ymm2,%%ymm2              \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "          \n"
-    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
-    "lea        " MEMLEA(0x20,1) ",%1            \n"
-    "sub        $0x20,%3                         \n"
-    "jg         1b                               \n"
-    "vzeroupper                                  \n"
-  : "+r"(src_uv),     // %0
-    "+r"(dst_u),      // %1
-    "+r"(dst_v),      // %2
-    "+r"(width)         // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+  asm volatile(
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
+      "sub        %1,%2                          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm2             \n"
+      "vpsrlw     $0x8,%%ymm1,%%ymm3             \n"
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpackuswb  %%ymm3,%%ymm2,%%ymm2           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vmovdqu    %%ymm2,0x00(%1,%2,1)            \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%3                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_u),   // %1
+        "+r"(dst_v),   // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SPLITUVROW_AVX2
 
 #ifdef HAS_SPLITUVROW_SSE2
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_SSE2(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width) {
-  asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                    \n"
-    "psrlw      $0x8,%%xmm5                      \n"
-    "sub        %1,%2                            \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
-    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
-    "lea        " MEMLEA(0x20,0) ",%0            \n"
-    "movdqa     %%xmm0,%%xmm2                    \n"
-    "movdqa     %%xmm1,%%xmm3                    \n"
-    "pand       %%xmm5,%%xmm0                    \n"
-    "pand       %%xmm5,%%xmm1                    \n"
-    "packuswb   %%xmm1,%%xmm0                    \n"
-    "psrlw      $0x8,%%xmm2                      \n"
-    "psrlw      $0x8,%%xmm3                      \n"
-    "packuswb   %%xmm3,%%xmm2                    \n"
-    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
-    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
-    "lea        " MEMLEA(0x10,1) ",%1            \n"
-    "sub        $0x10,%3                         \n"
-    "jg         1b                               \n"
-  : "+r"(src_uv),     // %0
-    "+r"(dst_u),      // %1
-    "+r"(dst_v),      // %2
-    "+r"(width)         // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+  asm volatile(
+      "pcmpeqb    %%xmm5,%%xmm5                  \n"
+      "psrlw      $0x8,%%xmm5                    \n"
+      "sub        %1,%2                          \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     0x10(%0),%%xmm1                \n"
+      "lea        0x20(%0),%0                    \n"
+      "movdqa     %%xmm0,%%xmm2                  \n"
+      "movdqa     %%xmm1,%%xmm3                  \n"
+      "pand       %%xmm5,%%xmm0                  \n"
+      "pand       %%xmm5,%%xmm1                  \n"
+      "packuswb   %%xmm1,%%xmm0                  \n"
+      "psrlw      $0x8,%%xmm2                    \n"
+      "psrlw      $0x8,%%xmm3                    \n"
+      "packuswb   %%xmm3,%%xmm2                  \n"
+      "movdqu     %%xmm0,(%1)                    \n"
+      "movdqu    %%xmm2,0x00(%1,%2,1)            \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x10,%3                       \n"
+      "jg         1b                             \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_u),   // %1
+        "+r"(dst_v),   // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SPLITUVROW_SSE2
 
 #ifdef HAS_MERGEUVROW_AVX2
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_AVX2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-    "sub       %0,%1                             \n"
-    LABELALIGN
-  "1:                                            \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0           \n"
-    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
-    "lea       " MEMLEA(0x20,0) ",%0             \n"
-    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2             \n"
-    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) "   \n"
-    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
-    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
-    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
-    "lea       " MEMLEA(0x40,2) ",%2             \n"
-    "sub       $0x20,%3                          \n"
-    "jg        1b                                \n"
-    "vzeroupper                                  \n"
-  : "+r"(src_u),     // %0
-    "+r"(src_v),     // %1
-    "+r"(dst_uv),    // %2
-    "+r"(width)      // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2"
-  );
+  asm volatile(
+
+      "sub       %0,%1                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu    0x00(%0,%1,1),%%ymm1           \n"
+      "lea       0x20(%0),%0                     \n"
+      "vpunpcklbw %%ymm1,%%ymm0,%%ymm2           \n"
+      "vpunpckhbw %%ymm1,%%ymm0,%%ymm0           \n"
+      "vextractf128 $0x0,%%ymm2,(%2)             \n"
+      "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
+      "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
+      "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
+      "lea       0x40(%2),%2                     \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGEUVROW_AVX2
 
 #ifdef HAS_MERGEUVROW_SSE2
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_SSE2(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-    "sub       %0,%1                             \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "movdqa    %%xmm0,%%xmm2                     \n"
-    "punpcklbw %%xmm1,%%xmm0                     \n"
-    "punpckhbw %%xmm1,%%xmm2                     \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
-    "lea       " MEMLEA(0x20,2) ",%2             \n"
-    "sub       $0x10,%3                          \n"
-    "jg        1b                                \n"
-  : "+r"(src_u),     // %0
-    "+r"(src_v),     // %1
-    "+r"(dst_uv),    // %2
-    "+r"(width)      // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2"
-  );
+  asm volatile(
+
+      "sub       %0,%1                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%1,1),%%xmm1            \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "punpcklbw %%xmm1,%%xmm0                   \n"
+      "punpckhbw %%xmm1,%%xmm2                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "movdqu    %%xmm2,0x10(%2)                 \n"
+      "lea       0x20(%2),%2                     \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGEUVROW_SSE2
 
-#ifdef HAS_COPYROW_SSE2
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MERGEUVROW_16_AVX2
+void MergeUVRow_16_AVX2(const uint16_t* src_u,
+                        const uint16_t* src_v,
+                        uint16_t* dst_uv,
+                        int scale,
+                        int width) {
+  // clang-format off
   asm volatile (
-    "test       $0xf,%0                        \n"
-    "jne        2f                             \n"
-    "test       $0xf,%1                        \n"
-    "jne        2f                             \n"
+    "vmovd      %4,%%xmm3                      \n"
+    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
+    "sub       %0,%1                           \n"
+
+    // 16 pixels per loop.
     LABELALIGN
-  "1:                                          \n"
-    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "1:                                        \n"
+    "vmovdqu   (%0),%%ymm0                     \n"
+    "vmovdqu   (%0,%1,1),%%ymm1                \n"
+    "add        $0x20,%0                       \n"
+
+    "vpmullw   %%ymm3,%%ymm0,%%ymm0            \n"
+    "vpmullw   %%ymm3,%%ymm1,%%ymm1            \n"
+    "vpunpcklwd %%ymm1,%%ymm0,%%ymm2           \n"  // mutates
+    "vpunpckhwd %%ymm1,%%ymm0,%%ymm0           \n"
+    "vextractf128 $0x0,%%ymm2,(%2)             \n"
+    "vextractf128 $0x0,%%ymm0,0x10(%2)         \n"
+    "vextractf128 $0x1,%%ymm2,0x20(%2)         \n"
+    "vextractf128 $0x1,%%ymm0,0x30(%2)         \n"
+    "add       $0x40,%2                        \n"
+    "sub       $0x10,%3                        \n"
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_u),   // %0
+    "+r"(src_v),   // %1
+    "+r"(dst_uv),  // %2
+    "+r"(width)    // %3
+  : "r"(scale)     // %4
+  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+  // clang-format on
+}
+#endif  // HAS_MERGEUVROW_16_AVX2
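
The scale values listed above (and for MultiplyRow_16_AVX2 below) work
because vpmullw keeps only the low 16 bits of each product, so multiplying
by 128/64/16/1 shifts a 9/10/12/16-bit sample up into the most significant
bits of its lane. A minimal scalar sketch (illustrative name, not libyuv
API; uses <stdint.h>):

    void MergeUVRow_16_C_sketch(const uint16_t* src_u, const uint16_t* src_v,
                                uint16_t* dst_uv, int scale, int width) {
      for (int i = 0; i < width; ++i) {
        dst_uv[2 * i + 0] = (uint16_t)(src_u[i] * scale);  // e.g. 9-bit U << 7
        dst_uv[2 * i + 1] = (uint16_t)(src_v[i] * scale);
      }
    }
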
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 128 = 9 bits
+// 64 = 10 bits
+// 16 = 12 bits
+// 1 = 16 bits
+#ifdef HAS_MULTIPLYROW_16_AVX2
+void MultiplyRow_16_AVX2(const uint16_t* src_y,
+                         uint16_t* dst_y,
+                         int scale,
+                         int width) {
+  // clang-format off
+  asm volatile (
+    "vmovd      %3,%%xmm3                      \n"
+    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
+    "vbroadcastss %%xmm3,%%ymm3                \n"
+    "sub       %0,%1                           \n"
+
+    // 16 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu   (%0),%%ymm0                     \n"
+    "vmovdqu   0x20(%0),%%ymm1                 \n"
+    "vpmullw   %%ymm3,%%ymm0,%%ymm0            \n"
+    "vpmullw   %%ymm3,%%ymm1,%%ymm1            \n"
+    "vmovdqu   %%ymm0,(%0,%1)                  \n"
+    "vmovdqu   %%ymm1,0x20(%0,%1)              \n"
+    "add        $0x40,%0                       \n"
     "sub       $0x20,%2                        \n"
     "jg        1b                              \n"
-    "jmp       9f                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm3");
+  // clang-format on
+}
+#endif  // HAS_MULTIPLYROW_16_AVX2
+
+// Use scale to convert lsb formats to msb, depending on how many bits there are:
+// 32768 = 9 bits
+// 16384 = 10 bits
+// 4096 = 12 bits
+// 256 = 16 bits
+void Convert16To8Row_SSSE3(const uint16_t* src_y,
+                           uint8_t* dst_y,
+                           int scale,
+                           int width) {
+  // clang-format off
+  asm volatile (
+    "movd      %3,%%xmm2                      \n"
+    "punpcklwd %%xmm2,%%xmm2                  \n"
+    "pshufd    $0x0,%%xmm2,%%xmm2             \n"
+
+    // 32 pixels per loop.
     LABELALIGN
-  "2:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
+    "1:                                       \n"
+    "movdqu    (%0),%%xmm0                    \n"
+    "movdqu    0x10(%0),%%xmm1                \n"
+    "add       $0x20,%0                       \n"
+    "pmulhuw   %%xmm2,%%xmm0                  \n"
+    "pmulhuw   %%xmm2,%%xmm1                  \n"
+    "packuswb  %%xmm1,%%xmm0                  \n"
+    "movdqu    %%xmm0,(%1)                    \n"
+    "add       $0x10,%1                       \n"
+    "sub       $0x10,%2                       \n"
+    "jg        1b                             \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+
+#ifdef HAS_CONVERT16TO8ROW_AVX2
+void Convert16To8Row_AVX2(const uint16_t* src_y,
+                          uint8_t* dst_y,
+                          int scale,
+                          int width) {
+  // clang-format off
+  asm volatile (
+    "vmovd      %3,%%xmm2                      \n"
+    "vpunpcklwd %%xmm2,%%xmm2,%%xmm2           \n"
+    "vbroadcastss %%xmm2,%%ymm2                \n"
+
+    // 32 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu   (%0),%%ymm0                     \n"
+    "vmovdqu   0x20(%0),%%ymm1                 \n"
+    "add       $0x40,%0                        \n"
+    "vpmulhuw  %%ymm2,%%ymm0,%%ymm0            \n"
+    "vpmulhuw  %%ymm2,%%ymm1,%%ymm1            \n"
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"  // mutates
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "vmovdqu   %%ymm0,(%1)                     \n"
+    "add       $0x20,%1                        \n"
     "sub       $0x20,%2                        \n"
-    "jg        2b                              \n"
-  "9:                                          \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+    "jg        1b                              \n"
+    "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
 }
-#endif  // HAS_COPYROW_SSE2
+#endif  // HAS_CONVERT16TO8ROW_AVX2
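
Convert16To8Row goes the other way: pmulhuw / vpmulhuw keeps the high 16
bits of each product, so (v * scale) >> 16 narrows a 9/10/12/16-bit sample
to 8 bits before packuswb clamps it. Scalar sketch (illustrative name; uses
<stdint.h>):

    void Convert16To8Row_C_sketch(const uint16_t* src_y, uint8_t* dst_y,
                                  int scale, int width) {
      for (int i = 0; i < width; ++i) {
        uint32_t v = ((uint32_t)src_y[i] * scale) >> 16;  // pmulhuw
        dst_y[i] = v > 255 ? 255 : (uint8_t)v;            // packuswb clamp
      }
    }
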
 
-#ifdef HAS_COPYROW_AVX
-void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+// Use scale to convert to lsb formats, depending on how many bits there are:
+// 512 = 9 bits
+// 1024 = 10 bits
+// 4096 = 12 bits
+// TODO(fbarchard): reduce to SSE2
+void Convert8To16Row_SSE2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width) {
+  // clang-format off
   asm volatile (
+    "movd      %3,%%xmm2                      \n"
+    "punpcklwd %%xmm2,%%xmm2                  \n"
+    "pshufd    $0x0,%%xmm2,%%xmm2             \n"
+
+    // 32 pixels per loop.
     LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x40,%2                        \n"
+    "1:                                       \n"
+    "movdqu    (%0),%%xmm0                    \n"
+    "movdqa    %%xmm0,%%xmm1                  \n"
+    "punpcklbw %%xmm0,%%xmm0                  \n"
+    "punpckhbw %%xmm1,%%xmm1                  \n"
+    "add       $0x10,%0                       \n"
+    "pmulhuw   %%xmm2,%%xmm0                  \n"
+    "pmulhuw   %%xmm2,%%xmm1                  \n"
+    "movdqu    %%xmm0,(%1)                    \n"
+    "movdqu    %%xmm1,0x10(%1)                \n"
+    "add       $0x20,%1                       \n"
+    "sub       $0x10,%2                       \n"
+    "jg        1b                             \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
+}
+
+#ifdef HAS_CONVERT8TO16ROW_AVX2
+void Convert8To16Row_AVX2(const uint8_t* src_y,
+                          uint16_t* dst_y,
+                          int scale,
+                          int width) {
+  // clang-format off
+  asm volatile (
+    "vmovd      %3,%%xmm2                      \n"
+    "vpunpcklwd %%xmm2,%%xmm2,%%xmm2           \n"
+    "vbroadcastss %%xmm2,%%ymm2                \n"
+
+    // 32 pixels per loop.
+    LABELALIGN
+    "1:                                        \n"
+    "vmovdqu   (%0),%%ymm0                     \n"
+    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+    "add       $0x20,%0                        \n"
+    "vpunpckhbw %%ymm0,%%ymm0,%%ymm1           \n"
+    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+    "vpmulhuw  %%ymm2,%%ymm0,%%ymm0            \n"
+    "vpmulhuw  %%ymm2,%%ymm1,%%ymm1            \n"
+    "vmovdqu   %%ymm0,(%1)                     \n"
+    "vmovdqu   %%ymm1,0x20(%1)                 \n"
+    "add       $0x40,%1                        \n"
+    "sub       $0x20,%2                        \n"
     "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+    "vzeroupper                                \n"
+  : "+r"(src_y),   // %0
+    "+r"(dst_y),   // %1
+    "+r"(width)    // %2
+  : "r"(scale)     // %3
+  : "memory", "cc", "xmm0", "xmm1", "xmm2");
+  // clang-format on
 }
+#endif  // HAS_CONVERT8TO16ROW_AVX2
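
Convert8To16Row first widens each byte with punpcklbw v,v, i.e. v * 0x0101,
so the pmulhuw by scale maps 255 to exactly 511, 1023 or 4095 for the 9, 10
and 12-bit targets. Scalar sketch (illustrative name; uses <stdint.h>):

    void Convert8To16Row_C_sketch(const uint8_t* src_y, uint16_t* dst_y,
                                  int scale, int width) {
      for (int i = 0; i < width; ++i) {
        uint32_t v = (uint32_t)src_y[i] * 0x0101;   // punpcklbw v,v
        dst_y[i] = (uint16_t)((v * scale) >> 16);   // pmulhuw
      }
    }
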
+
+#ifdef HAS_SPLITRGBROW_SSSE3
+
+// Shuffle table for converting RGB to Planar.
+static const uvec8 kShuffleMaskRGBToR0 = {0u,   3u,   6u,   9u,   12u,  15u,
+                                          128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          2u,   5u,   8u,   11u,  14u,  128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u, 128u, 1u,
+                                          4u,   7u,   10u,  13u};
+
+static const uvec8 kShuffleMaskRGBToG0 = {1u,   4u,   7u,   10u,  13u,  128u,
+                                          128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u,
+                                          3u,   6u,   9u,   12u,  15u,  128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u, 128u, 2u,
+                                          5u,   8u,   11u,  14u};
+
+static const uvec8 kShuffleMaskRGBToB0 = {2u,   5u,   8u,   11u,  14u,  128u,
+                                          128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u,
+                                          4u,   7u,   10u,  13u,  128u, 128u,
+                                          128u, 128u, 128u, 128u};
+static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u,
+                                          128u, 128u, 128u, 128u, 0u,   3u,
+                                          6u,   9u,   12u,  15u};
+
+void SplitRGBRow_SSSE3(const uint8_t* src_rgb,
+                       uint8_t* dst_r,
+                       uint8_t* dst_g,
+                       uint8_t* dst_b,
+                       int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     0x10(%0),%%xmm1                \n"
+      "movdqu     0x20(%0),%%xmm2                \n"
+      "pshufb     %5, %%xmm0                     \n"
+      "pshufb     %6, %%xmm1                     \n"
+      "pshufb     %7, %%xmm2                     \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,(%1)                    \n"
+      "lea        0x10(%1),%1                    \n"
+
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     0x10(%0),%%xmm1                \n"
+      "movdqu     0x20(%0),%%xmm2                \n"
+      "pshufb     %8, %%xmm0                     \n"
+      "pshufb     %9, %%xmm1                     \n"
+      "pshufb     %10, %%xmm2                    \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,(%2)                    \n"
+      "lea        0x10(%2),%2                    \n"
+
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     0x10(%0),%%xmm1                \n"
+      "movdqu     0x20(%0),%%xmm2                \n"
+      "pshufb     %11, %%xmm0                    \n"
+      "pshufb     %12, %%xmm1                    \n"
+      "pshufb     %13, %%xmm2                    \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,(%3)                    \n"
+      "lea        0x10(%3),%3                    \n"
+      "lea        0x30(%0),%0                    \n"
+      "sub        $0x10,%4                       \n"
+      "jg         1b                             \n"
+      : "+r"(src_rgb),             // %0
+        "+r"(dst_r),               // %1
+        "+r"(dst_g),               // %2
+        "+r"(dst_b),               // %3
+        "+r"(width)                // %4
+      : "m"(kShuffleMaskRGBToR0),  // %5
+        "m"(kShuffleMaskRGBToR1),  // %6
+        "m"(kShuffleMaskRGBToR2),  // %7
+        "m"(kShuffleMaskRGBToG0),  // %8
+        "m"(kShuffleMaskRGBToG1),  // %9
+        "m"(kShuffleMaskRGBToG2),  // %10
+        "m"(kShuffleMaskRGBToB0),  // %11
+        "m"(kShuffleMaskRGBToB1),  // %12
+        "m"(kShuffleMaskRGBToB2)   // %13
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_SPLITRGBROW_SSSE3
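
In pshufb, any index with the high bit set (the 128u table entries) writes a
zero byte, so each mask extracts one third of a plane and the two por
instructions stitch the thirds together; MergeRGBRow_SSSE3 below inverts the
operation with complementary masks. The net per-pixel effect, as plain C
(illustrative name; uses <stdint.h>):

    void SplitRGBRow_C_sketch(const uint8_t* src_rgb, uint8_t* dst_r,
                              uint8_t* dst_g, uint8_t* dst_b, int width) {
      for (int i = 0; i < width; ++i) {
        dst_r[i] = src_rgb[3 * i + 0];
        dst_g[i] = src_rgb[3 * i + 1];
        dst_b[i] = src_rgb[3 * i + 2];
      }
    }
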
+
+#ifdef HAS_MERGERGBROW_SSSE3
+
+// Shuffle table for converting Planar to RGB.
+static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u,
+                                          2u, 128u, 128u, 3u, 128u, 128u,
+                                          4u, 128u, 128u, 5u};
+static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u,
+                                          128u, 2u, 128u, 128u, 3u, 128u,
+                                          128u, 4u, 128u, 128u};
+static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u,
+                                          128u, 128u, 2u, 128u, 128u, 3u,
+                                          128u, 128u, 4u, 128u};
+
+static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u,
+                                          7u, 128u, 128u, 8u, 128u, 128u,
+                                          9u, 128u, 128u, 10u};
+static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u,
+                                          128u, 7u, 128u, 128u, 8u, 128u,
+                                          128u, 9u, 128u, 128u};
+static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u,  128u, 128u, 7u,
+                                          128u, 128u, 8u,  128u, 128u, 9u,
+                                          128u, 128u, 10u, 128u};
+
+static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u,
+                                          12u, 128u, 128u, 13u, 128u, 128u,
+                                          14u, 128u, 128u, 15u};
+static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u,
+                                          128u, 13u, 128u, 128u, 14u, 128u,
+                                          128u, 15u, 128u, 128u};
+static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u,
+                                          128u, 128u, 13u, 128u, 128u, 14u,
+                                          128u, 128u, 15u, 128u};
+
+void MergeRGBRow_SSSE3(const uint8_t* src_r,
+                       const uint8_t* src_g,
+                       const uint8_t* src_b,
+                       uint8_t* dst_rgb,
+                       int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     (%1),%%xmm1                    \n"
+      "movdqu     (%2),%%xmm2                    \n"
+      "pshufb     %5, %%xmm0                     \n"
+      "pshufb     %6, %%xmm1                     \n"
+      "pshufb     %7, %%xmm2                     \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,(%3)                    \n"
+
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     (%1),%%xmm1                    \n"
+      "movdqu     (%2),%%xmm2                    \n"
+      "pshufb     %8, %%xmm0                     \n"
+      "pshufb     %9, %%xmm1                     \n"
+      "pshufb     %10, %%xmm2                    \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,16(%3)                  \n"
+
+      "movdqu     (%0),%%xmm0                    \n"
+      "movdqu     (%1),%%xmm1                    \n"
+      "movdqu     (%2),%%xmm2                    \n"
+      "pshufb     %11, %%xmm0                    \n"
+      "pshufb     %12, %%xmm1                    \n"
+      "pshufb     %13, %%xmm2                    \n"
+      "por        %%xmm1,%%xmm0                  \n"
+      "por        %%xmm2,%%xmm0                  \n"
+      "movdqu     %%xmm0,32(%3)                  \n"
+
+      "lea        0x10(%0),%0                    \n"
+      "lea        0x10(%1),%1                    \n"
+      "lea        0x10(%2),%2                    \n"
+      "lea        0x30(%3),%3                    \n"
+      "sub        $0x10,%4                       \n"
+      "jg         1b                             \n"
+      : "+r"(src_r),               // %0
+        "+r"(src_g),               // %1
+        "+r"(src_b),               // %2
+        "+r"(dst_rgb),             // %3
+        "+r"(width)                // %4
+      : "m"(kShuffleMaskRToRGB0),  // %5
+        "m"(kShuffleMaskGToRGB0),  // %6
+        "m"(kShuffleMaskBToRGB0),  // %7
+        "m"(kShuffleMaskRToRGB1),  // %8
+        "m"(kShuffleMaskGToRGB1),  // %9
+        "m"(kShuffleMaskBToRGB1),  // %10
+        "m"(kShuffleMaskRToRGB2),  // %11
+        "m"(kShuffleMaskGToRGB2),  // %12
+        "m"(kShuffleMaskBToRGB2)   // %13
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_MERGERGBROW_SSSE3
+
+#ifdef HAS_COPYROW_SSE2
+void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "test       $0xf,%0                        \n"
+      "jne        2f                             \n"
+      "test       $0xf,%1                        \n"
+      "jne        2f                             \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqa    (%0),%%xmm0                     \n"
+      "movdqa    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqa    %%xmm0,(%1)                     \n"
+      "movdqa    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "jmp       9f                              \n"
+
+      LABELALIGN
+      "2:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        2b                              \n"
+
+      LABELALIGN "9:                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
+}
+#endif  // HAS_COPYROW_SSE2
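
CopyRow_SSE2 dispatches on alignment: the two test instructions take the
movdqu loop (label 2) unless both source and destination are 16-byte
aligned, in which case the movdqa loop (label 1) is used. Roughly
(illustrative only):

    int aligned = (((uintptr_t)src | (uintptr_t)dst) & 0xf) == 0;
    // aligned ? 32-byte movdqa copy loop : 32-byte movdqu copy loop
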
+
+#ifdef HAS_COPYROW_AVX
+void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vmovdqu   %%ymm0,(%1)                     \n"
+      "vmovdqu   %%ymm1,0x20(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x40,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
+}
 #endif  // HAS_COPYROW_AVX
 
 #ifdef HAS_COPYROW_ERMS
 // Multiple of 1.
-void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
+void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep movsb " MEMMOVESTRING(0,1) "          \n"
-  : "+S"(src),  // %0
-    "+D"(dst),  // %1
-    "+c"(width_tmp) // %2
-  :
-  : "memory", "cc"
-  );
+  asm volatile(
+
+      "rep movsb                      \n"
+      : "+S"(src),       // %0
+        "+D"(dst),       // %1
+        "+c"(width_tmp)  // %2
+      :
+      : "memory", "cc");
 }
 #endif  // HAS_COPYROW_ERMS
 
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm0,%%xmm0                   \n"
-    "pslld     $0x18,%%xmm0                    \n"
-    "pcmpeqb   %%xmm1,%%xmm1                   \n"
-    "psrld     $0x8,%%xmm1                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
-    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
-    "pand      %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm0,%%xmm3                   \n"
-    "pand      %%xmm1,%%xmm4                   \n"
-    "pand      %%xmm1,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm0,%%xmm0                   \n"
+      "pslld     $0x18,%%xmm0                    \n"
+      "pcmpeqb   %%xmm1,%%xmm1                   \n"
+      "psrld     $0x8,%%xmm1                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm2                     \n"
+      "movdqu    0x10(%0),%%xmm3                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqu    (%1),%%xmm4                     \n"
+      "movdqu    0x10(%1),%%xmm5                 \n"
+      "pand      %%xmm0,%%xmm2                   \n"
+      "pand      %%xmm0,%%xmm3                   \n"
+      "pand      %%xmm1,%%xmm4                   \n"
+      "pand      %%xmm1,%%xmm5                   \n"
+      "por       %%xmm4,%%xmm2                   \n"
+      "por       %%xmm5,%%xmm3                   \n"
+      "movdqu    %%xmm2,(%1)                     \n"
+      "movdqu    %%xmm3,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBCOPYALPHAROW_SSE2
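
The two masks built at the top (pslld $0x18 gives 0xFF000000 per pixel,
psrld $0x8 gives 0x00FFFFFF) merge source alpha into the destination color.
Per pixel (illustrative helper; uses <stdint.h>):

    static inline uint32_t CopyAlpha(uint32_t src_argb, uint32_t dst_argb) {
      return (src_argb & 0xFF000000u) | (dst_argb & 0x00FFFFFFu);
    }
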
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
-    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
-    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
+void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm1                     \n"
+      "vmovdqu   0x20(%0),%%ymm2                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1       \n"
+      "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2   \n"
+      "vmovdqu   %%ymm1,(%1)                     \n"
+      "vmovdqu   %%ymm2,0x20(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_ARGBCOPYALPHAROW_AVX2
 
 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
 // width in pixels
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
- asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ", %%xmm0        \n"
-    "movdqu    " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
-    "lea       " MEMLEA(0x20, 0) ", %0         \n"
-    "psrld     $0x18, %%xmm0                   \n"
-    "psrld     $0x18, %%xmm1                   \n"
-    "packssdw  %%xmm1, %%xmm0                  \n"
-    "packuswb  %%xmm0, %%xmm0                  \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8, 1) ", %1          \n"
-    "sub       $0x8, %2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_a),     // %1
-    "+rm"(width)     // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0), %%xmm0                    \n"
+      "movdqu    0x10(%0), %%xmm1                \n"
+      "lea       0x20(%0), %0                    \n"
+      "psrld     $0x18, %%xmm0                   \n"
+      "psrld     $0x18, %%xmm1                   \n"
+      "packssdw  %%xmm1, %%xmm0                  \n"
+      "packuswb  %%xmm0, %%xmm0                  \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1), %1                     \n"
+      "sub       $0x8, %2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_a),     // %1
+        "+rm"(width)     // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
 
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+static const uvec8 kShuffleAlphaShort_AVX2 = {
+    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
+    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
+
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+      "vmovdqa    %3,%%ymm4                      \n"
+      "vbroadcastf128 %4,%%ymm5                  \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0), %%ymm0                    \n"
+      "vmovdqu   0x20(%0), %%ymm1                \n"
+      "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"  // vpsrld $0x18, %%ymm0
+      "vpshufb    %%ymm5,%%ymm1,%%ymm1           \n"
+      "vmovdqu   0x40(%0), %%ymm2                \n"
+      "vmovdqu   0x60(%0), %%ymm3                \n"
+      "lea       0x80(%0), %0                    \n"
+      "vpackssdw  %%ymm1, %%ymm0, %%ymm0         \n"  // mutates
+      "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
+      "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // mutates
+      "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
+      "vpermd     %%ymm0,%%ymm4,%%ymm0           \n"  // unmutate.
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub        $0x20, %2                      \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),               // %0
+        "+r"(dst_a),                  // %1
+        "+rm"(width)                  // %2
+      : "m"(kPermdARGBToY_AVX),       // %3
+        "m"(kShuffleAlphaShort_AVX2)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
+
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm0,%%xmm0                   \n"
-    "pslld     $0x18,%%xmm0                    \n"
-    "pcmpeqb   %%xmm1,%%xmm1                   \n"
-    "psrld     $0x8,%%xmm1                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"
-    "punpckhwd %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm2,%%xmm2                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
-    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
-    "pand      %%xmm0,%%xmm2                   \n"
-    "pand      %%xmm0,%%xmm3                   \n"
-    "pand      %%xmm1,%%xmm4                   \n"
-    "pand      %%xmm1,%%xmm5                   \n"
-    "por       %%xmm4,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm0,%%xmm0                   \n"
+      "pslld     $0x18,%%xmm0                    \n"
+      "pcmpeqb   %%xmm1,%%xmm1                   \n"
+      "psrld     $0x8,%%xmm1                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm2                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "punpcklbw %%xmm2,%%xmm2                   \n"
+      "punpckhwd %%xmm2,%%xmm3                   \n"
+      "punpcklwd %%xmm2,%%xmm2                   \n"
+      "movdqu    (%1),%%xmm4                     \n"
+      "movdqu    0x10(%1),%%xmm5                 \n"
+      "pand      %%xmm0,%%xmm2                   \n"
+      "pand      %%xmm0,%%xmm3                   \n"
+      "pand      %%xmm1,%%xmm4                   \n"
+      "pand      %%xmm1,%%xmm5                   \n"
+      "por       %%xmm4,%%xmm2                   \n"
+      "por       %%xmm5,%%xmm3                   \n"
+      "movdqu    %%xmm2,(%1)                     \n"
+      "movdqu    %%xmm3,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
 // width in pixels
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
-    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
-    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
-    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
-    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
-    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxbd (%0),%%ymm1                     \n"
+      "vpmovzxbd 0x8(%0),%%ymm2                  \n"
+      "lea       0x10(%0),%0                     \n"
+      "vpslld    $0x18,%%ymm1,%%ymm1             \n"
+      "vpslld    $0x18,%%ymm2,%%ymm2             \n"
+      "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1       \n"
+      "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2   \n"
+      "vmovdqu   %%ymm1,(%1)                     \n"
+      "vmovdqu   %%ymm2,0x20(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
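
A scalar sketch of what the two ARGBCopyYToAlphaRow variants above compute, from my reading of the mask setup (0xFF000000 keeps the alpha lane, 0x00FFFFFF keeps the color lanes of dst); this is not libyuv's actual C fallback:

#include <stdint.h>

// Copy one row of Y (luma) bytes into the alpha channel of an ARGB row.
// libyuv "ARGB" is byte order B,G,R,A in memory, so alpha is byte 3.
static void ARGBCopyYToAlphaRow_Sketch(const uint8_t* src, uint8_t* dst,
                                       int width) {
  for (int x = 0; x < width; ++x) {
    dst[x * 4 + 3] = src[x];  // overwrite alpha, leave B,G,R untouched
  }
}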
 
 #ifdef HAS_SETROW_X86
-void SetRow_X86(uint8* dst, uint8 v8, int width) {
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width >> 2);
-  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
-  asm volatile (
-    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
-    : "+D"(dst),       // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
+  const uint32_t v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
+  asm volatile(
+
+      "rep stosl                      \n"
+      : "+D"(dst),       // %0
+        "+c"(width_tmp)  // %1
+      : "a"(v32)         // %2
+      : "memory", "cc");
 }
 
-void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep stosb " MEMSTORESTRING(al,0) "        \n"
-    : "+D"(dst),       // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v8)          // %2
-    : "memory", "cc");
+  asm volatile(
+
+      "rep stosb                      \n"
+      : "+D"(dst),       // %0
+        "+c"(width_tmp)  // %1
+      : "a"(v8)          // %2
+      : "memory", "cc");
 }
 
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) {
   size_t width_tmp = (size_t)(width);
-  asm volatile (
-    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
-    : "+D"(dst_argb),  // %0
-      "+c"(width_tmp)  // %1
-    : "a"(v32)         // %2
-    : "memory", "cc");
+  asm volatile(
+
+      "rep stosl                      \n"
+      : "+D"(dst_argb),  // %0
+        "+c"(width_tmp)  // %1
+      : "a"(v32)         // %2
+      : "memory", "cc");
 }
 #endif  // HAS_SETROW_X86
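
The setup in SetRow_X86 above relies on a small trick worth spelling out: multiplying a byte by 0x01010101 replicates it into all four byte lanes of a 32-bit word, so `rep stosl` can fill four bytes per store (hence the `width >> 2` count). A minimal illustration:

#include <assert.h>
#include <stdint.h>

static uint32_t ReplicateByte(uint8_t v8) {
  return v8 * 0x01010101u;  // e.g. 0xAB -> 0xABABABAB
}

int main(void) {
  assert(ReplicateByte(0xAB) == 0xABABABABu);
  assert(ReplicateByte(0x00) == 0x00000000u);
  return 0;
}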
 
 #ifdef HAS_YUY2TOYROW_SSE2
-void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrlw     $0x8,%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_yuy2))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrlw     $0x8,%%xmm5                     \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%4,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%4,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "pavgb     %%xmm3,%%xmm1                   \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq    %%xmm1,0x00(%1,%2,1)              \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_yuy2),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_yuy2))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrlw     $0x8,%%xmm5                     \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq    %%xmm1,0x00(%1,%2,1)              \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_uyvy))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrlw     $0x8,%%xmm5                     \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%4,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%4,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "pavgb     %%xmm3,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq    %%xmm1,0x00(%1,%2,1)              \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_uyvy),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_uyvy))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrlw     $0x8,%%xmm5                     \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq    %%xmm1,0x00(%1,%2,1)              \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_YUY2TOYROW_SSE2
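
For reference, YUY2 packs pixels as Y0 U Y1 V, so luma sits at even byte offsets; the SSE2 rows above isolate it with a 0x00FF mask (psrlw of all-ones). UYVY is the byte-swapped packing (U Y0 V Y1), so the UYVY rows shift right by 8 instead. A scalar sketch of both extractions, assuming those packings (not libyuv's actual C fallbacks):

#include <stdint.h>

static void YUY2ToYRow_Sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                              int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = src_yuy2[x * 2];      // Y bytes at even offsets
  }
}

static void UYVYToYRow_Sketch(const uint8_t* src_uyvy, uint8_t* dst_y,
                              int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = src_uyvy[x * 2 + 1];  // Y bytes at odd offsets
  }
}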
 
 #ifdef HAS_YUY2TOYROW_AVX2
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "lea      " MEMLEA(0x20,1) ",%1            \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+      "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+      "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vmovdqu   %%ymm0,(%1)                     \n"
+      "lea      0x20(%1),%1                      \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_yuy2))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+      "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
+      "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea      0x10(%1),%1                      \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_yuy2),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_yuy2))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_yuy2),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+      "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea      0x10(%1),%1                      \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "lea      " MEMLEA(0x20,1) ",%1            \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vmovdqu   %%ymm0,(%1)                     \n"
+      "lea      0x20(%1),%1                      \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
-    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
-    "sub       %1,%2                           \n"
+void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
+      "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
+      "sub       %1,%2                           \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
-    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  : "r"((intptr_t)(stride_uyvy))  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "vpavgb    0x00(%0,%4,1),%%ymm0,%%ymm0     \n"
+      "vpavgb    0x20(%0,%4,1),%%ymm1,%%ymm1     \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+      "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea      0x10(%1),%1                      \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uyvy),               // %0
+        "+r"(dst_u),                  // %1
+        "+r"(dst_v),                  // %2
+        "+r"(width)                   // %3
+      : "r"((intptr_t)(stride_uyvy))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
-    "sub       %1,%2                           \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
-    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
-    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
-    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
-    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
-    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
-    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
-    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
-    "lea      " MEMLEA(0x10,1) ",%1            \n"
-    "sub       $0x20,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_uyvy),    // %0
-    "+r"(dst_u),       // %1
-    "+r"(dst_v),       // %2
-    "+r"(width)          // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  asm volatile(
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
+      "sub       %1,%2                           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
+      "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
+      "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
+      "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
+      "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
+      "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
+      "vextractf128 $0x0,%%ymm1,(%1)             \n"
+      "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1)    \n"
+      "lea      0x10(%1),%1                      \n"
+      "sub       $0x20,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_YUY2TOYROW_AVX2
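
The UV variants above differ in one detail: the ToUVRow functions average the chroma of two adjacent rows (the pavgb/vpavgb against %4, the row stride) to produce 4:2:0 output, while the ToUV422Row functions take a single row's chroma as-is. A scalar sketch of the 4:2:2 case for YUY2, assuming the Y0 U Y1 V packing noted earlier and an even width:

#include <stdint.h>

static void YUY2ToUV422Row_Sketch(const uint8_t* src_yuy2, uint8_t* dst_u,
                                  uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {  // one U,V pair per 2 pixels
    dst_u[x / 2] = src_yuy2[x * 2 + 1];
    dst_v[x / 2] = src_yuy2[x * 2 + 3];
  }
}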
 
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
-static uvec8 kShuffleAlpha = {
-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
+static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
+                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
 
 // Blend 8 pixels at a time
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psrlw     $0xf,%%xmm7                     \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x8,%%xmm6                     \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psllw     $0x8,%%xmm5                     \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "pslld     $0x18,%%xmm4                    \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+                        const uint8_t* src_argb1,
+                        uint8_t* dst_argb,
+                        int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm7,%%xmm7                   \n"
+      "psrlw     $0xf,%%xmm7                     \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "psrlw     $0x8,%%xmm6                     \n"
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psllw     $0x8,%%xmm5                     \n"
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "pslld     $0x18,%%xmm4                    \n"
+      "sub       $0x4,%3                         \n"
+      "jl        49f                             \n"
 
-    // 4 pixel loop.
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
+      // 4 pixel loop.
+      LABELALIGN
+      "40:                                       \n"
+      "movdqu    (%0),%%xmm3                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqa    %%xmm3,%%xmm0                   \n"
+      "pxor      %%xmm4,%%xmm3                   \n"
+      "movdqu    (%1),%%xmm2                     \n"
+      "pshufb    %4,%%xmm3                       \n"
+      "pand      %%xmm6,%%xmm2                   \n"
+      "paddw     %%xmm7,%%xmm3                   \n"
+      "pmullw    %%xmm3,%%xmm2                   \n"
+      "movdqu    (%1),%%xmm1                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "por       %%xmm4,%%xmm0                   \n"
+      "pmullw    %%xmm3,%%xmm1                   \n"
+      "psrlw     $0x8,%%xmm2                     \n"
+      "paddusb   %%xmm2,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jge       40b                             \n"
 
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        99f                             \n"
+      "49:                                       \n"
+      "add       $0x3,%3                         \n"
+      "jl        99f                             \n"
 
-    // 1 pixel loop.
-  "91:                                         \n"
-    "movd      " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "movdqa    %%xmm3,%%xmm0                   \n"
-    "pxor      %%xmm4,%%xmm3                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm2         \n"
-    "pshufb    %4,%%xmm3                       \n"
-    "pand      %%xmm6,%%xmm2                   \n"
-    "paddw     %%xmm7,%%xmm3                   \n"
-    "pmullw    %%xmm3,%%xmm2                   \n"
-    "movd      " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x4,1) ",%1            \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "por       %%xmm4,%%xmm0                   \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm2                     \n"
-    "paddusb   %%xmm2,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x4,2) ",%2            \n"
-    "sub       $0x1,%3                         \n"
-    "jge       91b                             \n"
-  "99:                                         \n"
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  : "m"(kShuffleAlpha)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 1 pixel loop.
+      "91:                                       \n"
+      "movd      (%0),%%xmm3                     \n"
+      "lea       0x4(%0),%0                      \n"
+      "movdqa    %%xmm3,%%xmm0                   \n"
+      "pxor      %%xmm4,%%xmm3                   \n"
+      "movd      (%1),%%xmm2                     \n"
+      "pshufb    %4,%%xmm3                       \n"
+      "pand      %%xmm6,%%xmm2                   \n"
+      "paddw     %%xmm7,%%xmm3                   \n"
+      "pmullw    %%xmm3,%%xmm2                   \n"
+      "movd      (%1),%%xmm1                     \n"
+      "lea       0x4(%1),%1                      \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "por       %%xmm4,%%xmm0                   \n"
+      "pmullw    %%xmm3,%%xmm1                   \n"
+      "psrlw     $0x8,%%xmm2                     \n"
+      "paddusb   %%xmm2,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movd      %%xmm0,(%2)                     \n"
+      "lea       0x4(%2),%2                      \n"
+      "sub       $0x1,%3                         \n"
+      "jge       91b                             \n"
+      "99:                                       \n"
+      : "+r"(src_argb0),    // %0
+        "+r"(src_argb1),    // %1
+        "+r"(dst_argb),     // %2
+        "+r"(width)         // %3
+      : "m"(kShuffleAlpha)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
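
ARGBBlendRow_SSSE3 above is a source-over blend; despite the "8 pixels" comment, the main loop handles 4 pixels per iteration (sub $0x4) with a 1-pixel tail loop. From my reading of the mask/multiply sequence, each channel comes out as src plus dst scaled by (256 - src_alpha)/256, saturated, with the result alpha forced to 255. A per-channel sketch of that arithmetic:

#include <stdint.h>

// fg = src_argb0 channel, bg = src_argb1 channel, a = fg's alpha.
static uint8_t BlendChannel_Sketch(uint8_t fg, uint8_t bg, uint8_t a) {
  uint32_t v = fg + ((bg * (256u - a)) >> 8);
  return v > 255 ? 255 : (uint8_t)v;  // paddusb saturates the sum
}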
 
@@ -3450,46 +4580,49 @@
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
-  asm volatile (
-    "pcmpeqb    %%xmm5,%%xmm5                  \n"
-    "psllw      $0x8,%%xmm5                    \n"
-    "mov        $0x80808080,%%eax              \n"
-    "movd       %%eax,%%xmm6                   \n"
-    "pshufd     $0x0,%%xmm6,%%xmm6             \n"
-    "mov        $0x807f807f,%%eax              \n"
-    "movd       %%eax,%%xmm7                   \n"
-    "pshufd     $0x0,%%xmm7,%%xmm7             \n"
-    "sub        %2,%0                          \n"
-    "sub        %2,%1                          \n"
-    "sub        %2,%3                          \n"
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+                         const uint8_t* src1,
+                         const uint8_t* alpha,
+                         uint8_t* dst,
+                         int width) {
+  asm volatile(
+      "pcmpeqb    %%xmm5,%%xmm5                  \n"
+      "psllw      $0x8,%%xmm5                    \n"
+      "mov        $0x80808080,%%eax              \n"
+      "movd       %%eax,%%xmm6                   \n"
+      "pshufd     $0x0,%%xmm6,%%xmm6             \n"
+      "mov        $0x807f807f,%%eax              \n"
+      "movd       %%eax,%%xmm7                   \n"
+      "pshufd     $0x0,%%xmm7,%%xmm7             \n"
+      "sub        %2,%0                          \n"
+      "sub        %2,%1                          \n"
+      "sub        %2,%3                          \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq       (%2),%%xmm0                    \n"
-    "punpcklbw  %%xmm0,%%xmm0                  \n"
-    "pxor       %%xmm5,%%xmm0                  \n"
-    "movq       (%0,%2,1),%%xmm1               \n"
-    "movq       (%1,%2,1),%%xmm2               \n"
-    "punpcklbw  %%xmm2,%%xmm1                  \n"
-    "psubb      %%xmm6,%%xmm1                  \n"
-    "pmaddubsw  %%xmm1,%%xmm0                  \n"
-    "paddw      %%xmm7,%%xmm0                  \n"
-    "psrlw      $0x8,%%xmm0                    \n"
-    "packuswb   %%xmm0,%%xmm0                  \n"
-    "movq       %%xmm0,(%3,%2,1)               \n"
-    "lea        0x8(%2),%2                     \n"
-    "sub        $0x8,%4                        \n"
-    "jg        1b                              \n"
-  : "+r"(src0),       // %0
-    "+r"(src1),       // %1
-    "+r"(alpha),      // %2
-    "+r"(dst),        // %3
-    "+rm"(width)      // %4
-  :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq       (%2),%%xmm0                    \n"
+      "punpcklbw  %%xmm0,%%xmm0                  \n"
+      "pxor       %%xmm5,%%xmm0                  \n"
+      "movq       (%0,%2,1),%%xmm1               \n"
+      "movq       (%1,%2,1),%%xmm2               \n"
+      "punpcklbw  %%xmm2,%%xmm1                  \n"
+      "psubb      %%xmm6,%%xmm1                  \n"
+      "pmaddubsw  %%xmm1,%%xmm0                  \n"
+      "paddw      %%xmm7,%%xmm0                  \n"
+      "psrlw      $0x8,%%xmm0                    \n"
+      "packuswb   %%xmm0,%%xmm0                  \n"
+      "movq       %%xmm0,(%3,%2,1)               \n"
+      "lea        0x8(%2),%2                     \n"
+      "sub        $0x8,%4                        \n"
+      "jg        1b                              \n"
+      : "+r"(src0),   // %0
+        "+r"(src1),   // %1
+        "+r"(alpha),  // %2
+        "+r"(dst),    // %3
+        "+rm"(width)  // %4
+        ::"memory",
+        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
 }
 #endif  // HAS_BLENDPLANEROW_SSSE3
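
The two formulas in the comment above BlendPlaneRow are algebraically identical: expanding the signed form gives A2*C2 + B2*(255-C2) - 128*255 + 32768 + 127 = A2*C2 + B2*(255-C2) + 255. The signed variant exists because pmaddubsw multiplies unsigned bytes by signed bytes, so the sources are biased by 128 (the psubb of 0x80808080) and the bias is folded back into the 0x807f constant. An exhaustive self-check of the equivalence:

#include <assert.h>

static void CheckBlendMath(void) {
  for (int a = 0; a < 256; ++a)
    for (int b = 0; b < 256; ++b)
      for (int c = 0; c < 256; ++c) {
        int u = (a * c + b * (255 - c) + 255) / 256;
        int s = ((a - 128) * c + (b - 128) * (255 - c) + 32768 + 127) / 256;
        assert(u == s);
      }
}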
 
@@ -3499,196 +4632,195 @@
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                        const uint8* alpha, uint8* dst, int width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
-    "mov        $0x80808080,%%eax              \n"
-    "vmovd      %%eax,%%xmm6                   \n"
-    "vbroadcastss %%xmm6,%%ymm6                \n"
-    "mov        $0x807f807f,%%eax              \n"
-    "vmovd      %%eax,%%xmm7                   \n"
-    "vbroadcastss %%xmm7,%%ymm7                \n"
-    "sub        %2,%0                          \n"
-    "sub        %2,%1                          \n"
-    "sub        %2,%3                          \n"
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+                        const uint8_t* src1,
+                        const uint8_t* alpha,
+                        uint8_t* dst,
+                        int width) {
+  asm volatile(
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
+      "mov        $0x80808080,%%eax              \n"
+      "vmovd      %%eax,%%xmm6                   \n"
+      "vbroadcastss %%xmm6,%%ymm6                \n"
+      "mov        $0x807f807f,%%eax              \n"
+      "vmovd      %%eax,%%xmm7                   \n"
+      "vbroadcastss %%xmm7,%%ymm7                \n"
+      "sub        %2,%0                          \n"
+      "sub        %2,%1                          \n"
+      "sub        %2,%3                          \n"
 
-    // 32 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    (%2),%%ymm0                    \n"
-    "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
-    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vmovdqu    (%0,%2,1),%%ymm1               \n"
-    "vmovdqu    (%1,%2,1),%%ymm2               \n"
-    "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
-    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
-    "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
-    "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
-    "vmovdqu    %%ymm0,(%3,%2,1)               \n"
-    "lea        0x20(%2),%2                    \n"
-    "sub        $0x20,%4                       \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src0),       // %0
-    "+r"(src1),       // %1
-    "+r"(alpha),      // %2
-    "+r"(dst),        // %3
-    "+rm"(width)      // %4
-  :: "memory", "cc", "eax",
-     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 32 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%2),%%ymm0                    \n"
+      "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
+      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
+      "vmovdqu    (%0,%2,1),%%ymm1               \n"
+      "vmovdqu    (%1,%2,1),%%ymm2               \n"
+      "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
+      "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
+      "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
+      "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
+      "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,(%3,%2,1)               \n"
+      "lea        0x20(%2),%2                    \n"
+      "sub        $0x20,%4                       \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src0),   // %0
+        "+r"(src1),   // %1
+        "+r"(alpha),  // %2
+        "+r"(dst),    // %3
+        "+rm"(width)  // %4
+        ::"memory",
+        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_BLENDPLANEROW_AVX2
 
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha
-static uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
-};
-static uvec8 kShuffleAlpha1 = {
-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u
-};
+static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
+                                     7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
+static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+                                     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
 // Attenuate 4 pixels at a time.
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb   %%xmm3,%%xmm3                   \n"
-    "pslld     $0x18,%%xmm3                    \n"
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
+void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            int width) {
+  asm volatile(
+      "pcmpeqb   %%xmm3,%%xmm3                   \n"
+      "pslld     $0x18,%%xmm3                    \n"
+      "movdqa    %3,%%xmm4                       \n"
+      "movdqa    %4,%%xmm5                       \n"
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "punpcklbw %%xmm1,%%xmm1                   \n"
-    "pmulhuw   %%xmm1,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "punpckhbw %%xmm2,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "pand      %%xmm3,%%xmm2                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "por       %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleAlpha0),  // %3
-    "m"(kShuffleAlpha1)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "punpcklbw %%xmm1,%%xmm1                   \n"
+      "pmulhuw   %%xmm1,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "movdqu    (%0),%%xmm2                     \n"
+      "punpckhbw %%xmm2,%%xmm2                   \n"
+      "pmulhuw   %%xmm2,%%xmm1                   \n"
+      "movdqu    (%0),%%xmm2                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "pand      %%xmm3,%%xmm2                   \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "por       %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),       // %0
+        "+r"(dst_argb),       // %1
+        "+r"(width)           // %2
+      : "m"(kShuffleAlpha0),  // %3
+        "m"(kShuffleAlpha1)   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBATTENUATEROW_SSSE3
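
The attenuate kernels rely on a fixed-point identity: punpcklbw of a
register with itself turns each byte v into the 16-bit word v*257, and the
pshufb tables above build a*257 in the same lanes, so pmulhuw followed by a
further >>8 yields (v*257 * a*257) >> 24, which is v*a/255 to within one
step. A scalar sketch of the row operation (a model of the kernel, not
libyuv's C reference):

    // Scalar model of ARGBAttenuateRow: B,G,R are scaled by alpha/255 and
    // alpha itself is carried through (the pand/por mask in the kernel).
    static void ARGBAttenuateRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                          int width) {
      for (int i = 0; i < width; ++i) {
        uint32_t a = src[i * 4 + 3];
        for (int c = 0; c < 3; ++c) {
          uint32_t v = src[i * 4 + c];
          dst[i * 4 + c] = (uint8_t)((v * 257 * (a * 257)) >> 24);
        }
        dst[i * 4 + 3] = (uint8_t)a;
      }
    }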
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
+static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
+                                         128u, 128u, 14u,  15u, 14u, 15u,
+                                         14u,  15u,  128u, 128u};
 // Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "vbroadcastf128 %3,%%ymm4                  \n"
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpslld     $0x18,%%ymm5,%%ymm5            \n"
-    "sub        %0,%1                          \n"
+void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width) {
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm4                  \n"
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpslld     $0x18,%%ymm5,%%ymm5            \n"
+      "sub        %0,%1                          \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
-    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
-    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
-    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
-    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub        $0x8,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)        // %2
-  : "m"(kShuffleAlpha_AVX2)  // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm6                    \n"
+      "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
+      "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
+      "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
+      "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
+      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,0x00(%0,%1,1)           \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub        $0x8,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),          // %0
+        "+r"(dst_argb),          // %1
+        "+r"(width)              // %2
+      : "m"(kShuffleAlpha_AVX2)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBATTENUATEROW_AVX2
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
                              int width) {
   uintptr_t alpha;
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "movlhps   %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
-    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "movlhps   %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),     // %0
-    "+r"(dst_argb),     // %1
-    "+r"(width),        // %2
-    "=&r"(alpha)        // %3
-  : "r"(fixed_invtbl8)  // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movzb     0x03(%0),%3                     \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "movd      0x00(%4,%3,4),%%xmm2            \n"
+      "movzb     0x07(%0),%3                     \n"
+      "movd      0x00(%4,%3,4),%%xmm3            \n"
+      "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+      "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+      "movlhps   %%xmm3,%%xmm2                   \n"
+      "pmulhuw   %%xmm2,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "movzb     0x0b(%0),%3                     \n"
+      "punpckhbw %%xmm1,%%xmm1                   \n"
+      "movd      0x00(%4,%3,4),%%xmm2            \n"
+      "movzb     0x0f(%0),%3                     \n"
+      "movd      0x00(%4,%3,4),%%xmm3            \n"
+      "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+      "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+      "movlhps   %%xmm3,%%xmm2                   \n"
+      "pmulhuw   %%xmm2,%%xmm1                   \n"
+      "lea       0x10(%0),%0                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),     // %0
+        "+r"(dst_argb),     // %1
+        "+r"(width),        // %2
+        "=&r"(alpha)        // %3
+      : "r"(fixed_invtbl8)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
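
Unattenuation is the inverse: divide B, G and R back out by alpha/255.
Rather than dividing per pixel, the kernel gathers 16-bit reciprocals from
fixed_invtbl8 (movzb of each alpha byte, then movd from the table) and
multiplies with pmulhuw. fixed_invtbl8 is defined elsewhere in
row_common.cc, so the rounding in this sketch is an approximation:

    // Scalar model of ARGBUnattenuateRow: divide B,G,R by alpha/255 with
    // saturation; alpha is carried through. The kernel uses table lookups
    // (fixed_invtbl8) instead of this per-pixel divide.
    static void ARGBUnattenuateRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                            int width) {
      for (int i = 0; i < width; ++i) {
        uint32_t a = src[i * 4 + 3];
        for (int c = 0; c < 3; ++c) {
          uint32_t v = src[i * 4 + c];
          uint32_t u = a ? (v * 255) / a : v;
          dst[i * 4 + c] = (uint8_t)(u > 255 ? 255 : u);
        }
        dst[i * 4 + 3] = (uint8_t)a;
      }
    }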
 
@@ -3695,116 +4827,113 @@
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
+    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
 // Unattenuate 8 pixels at a time.
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
+void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
                              int width) {
   uintptr_t alpha;
-  asm volatile (
-    "sub        %0,%1                          \n"
-    "vbroadcastf128 %5,%%ymm5                  \n"
+  asm volatile(
+      "sub        %0,%1                          \n"
+      "vbroadcastf128 %5,%%ymm5                  \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    // replace VPGATHER
-    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
-    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
-    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
-    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
-    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
-    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
-    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
-    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
-    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
-    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
-    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
-    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
-    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
-    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
-    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
-    // end of VPGATHER
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      // replace VPGATHER
+      "movzb     0x03(%0),%3                     \n"
+      "vmovd     0x00(%4,%3,4),%%xmm0            \n"
+      "movzb     0x07(%0),%3                     \n"
+      "vmovd     0x00(%4,%3,4),%%xmm1            \n"
+      "movzb     0x0b(%0),%3                     \n"
+      "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
+      "vmovd     0x00(%4,%3,4),%%xmm2            \n"
+      "movzb     0x0f(%0),%3                     \n"
+      "vmovd     0x00(%4,%3,4),%%xmm3            \n"
+      "movzb     0x13(%0),%3                     \n"
+      "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
+      "vmovd     0x00(%4,%3,4),%%xmm0            \n"
+      "movzb     0x17(%0),%3                     \n"
+      "vmovd     0x00(%4,%3,4),%%xmm1            \n"
+      "movzb     0x1b(%0),%3                     \n"
+      "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
+      "vmovd     0x00(%4,%3,4),%%xmm2            \n"
+      "movzb     0x1f(%0),%3                     \n"
+      "vmovd     0x00(%4,%3,4),%%xmm3            \n"
+      "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
+      "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
+      "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
+      "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
+      // end of VPGATHER
 
-    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
-    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
-    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
-    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
-    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
-    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
-    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub        $0x8,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb),      // %1
-    "+r"(width),         // %2
-    "=&r"(alpha)         // %3
-  : "r"(fixed_invtbl8),  // %4
-    "m"(kUnattenShuffleAlpha_AVX2)  // %5
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      "vmovdqu    (%0),%%ymm6                    \n"
+      "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
+      "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
+      "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
+      "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
+      "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
+      "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,0x00(%0,%1,1)           \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub        $0x8,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),                 // %0
+        "+r"(dst_argb),                 // %1
+        "+r"(width),                    // %2
+        "=&r"(alpha)                    // %3
+      : "r"(fixed_invtbl8),             // %4
+        "m"(kUnattenShuffleAlpha_AVX2)  // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
 
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
+void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movdqa    %3,%%xmm4                       \n"
+      "movdqa    %4,%%xmm5                       \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm4,%%xmm0                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm0                   \n"
-    "paddw     %%xmm5,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrld     $0x18,%%xmm2                    \n"
-    "psrld     $0x18,%%xmm3                    \n"
-    "packuswb  %%xmm3,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpcklbw %%xmm2,%%xmm3                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm3,%%xmm0                   \n"
-    "punpckhwd %%xmm3,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "m"(kARGBToYJ),   // %3
-    "m"(kAddYJ64)     // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "phaddw    %%xmm1,%%xmm0                   \n"
+      "paddw     %%xmm5,%%xmm0                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm2                     \n"
+      "movdqu    0x10(%0),%%xmm3                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "psrld     $0x18,%%xmm2                    \n"
+      "psrld     $0x18,%%xmm3                    \n"
+      "packuswb  %%xmm3,%%xmm2                   \n"
+      "packuswb  %%xmm2,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm3                   \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "punpcklbw %%xmm2,%%xmm3                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklwd %%xmm3,%%xmm0                   \n"
+      "punpckhwd %%xmm3,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "m"(kARGBToYJ),  // %3
+        "m"(kAddYJ64)    // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBGRAYROW_SSSE3
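
ARGBGrayRow computes one full-range luma value per pixel and replicates it
into B, G and R, keeping alpha (the punpcklbw/punpcklwd tail re-interleaves
y,y,y,a). kARGBToYJ and kAddYJ64 are defined elsewhere; the (15, 75, 38)
weights and +64 rounding below are an assumption based on the JPEG luma
formula those tables normally encode:

    // Scalar model of ARGBGrayRow. The (15, 75, 38) weights and +64
    // rounding are assumed, matching the full-range (JPEG) luma formula.
    static void ARGBGrayRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                     int width) {
      for (int i = 0; i < width; ++i) {
        uint8_t b = src[i * 4 + 0], g = src[i * 4 + 1], r = src[i * 4 + 2];
        uint8_t y = (uint8_t)((b * 15 + g * 75 + r * 38 + 64) >> 7);
        dst[i * 4 + 0] = dst[i * 4 + 1] = dst[i * 4 + 2] = y;  // gray RGB
        dst[i * 4 + 3] = src[i * 4 + 3];                       // keep alpha
      }
    }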
 
@@ -3813,73 +4942,68 @@
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
 // Constant for ARGB color to sepia tone
-static vec8 kARGBToSepiaB = {
-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+                                   17, 68, 35, 0, 17, 68, 35, 0};
 
-static vec8 kARGBToSepiaG = {
-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+                                   22, 88, 45, 0, 22, 88, 45, 0};
 
-static vec8 kARGBToSepiaR = {
-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+                                   24, 98, 50, 0, 24, 98, 50, 0};
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
-  asm volatile (
-    "movdqa    %2,%%xmm2                       \n"
-    "movdqa    %3,%%xmm3                       \n"
-    "movdqa    %4,%%xmm4                       \n"
+void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movdqa    %2,%%xmm2                       \n"
+      "movdqa    %3,%%xmm3                       \n"
+      "movdqa    %4,%%xmm4                       \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "pmaddubsw %%xmm2,%%xmm6                   \n"
-    "phaddw    %%xmm6,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm3,%%xmm5                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm5                   \n"
-    "psrlw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm4,%%xmm5                   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "phaddw    %%xmm1,%%xmm5                   \n"
-    "psrlw     $0x7,%%xmm5                     \n"
-    "packuswb  %%xmm5,%%xmm5                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "psrld     $0x18,%%xmm6                    \n"
-    "psrld     $0x18,%%xmm1                    \n"
-    "packuswb  %%xmm1,%%xmm6                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm5                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklwd %%xmm5,%%xmm0                   \n"
-    "punpckhwd %%xmm5,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub       $0x8,%1                         \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),      // %0
-    "+r"(width)          // %1
-  : "m"(kARGBToSepiaB),  // %2
-    "m"(kARGBToSepiaG),  // %3
-    "m"(kARGBToSepiaR)   // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm6                 \n"
+      "pmaddubsw %%xmm2,%%xmm0                   \n"
+      "pmaddubsw %%xmm2,%%xmm6                   \n"
+      "phaddw    %%xmm6,%%xmm0                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm5                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "pmaddubsw %%xmm3,%%xmm5                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "phaddw    %%xmm1,%%xmm5                   \n"
+      "psrlw     $0x7,%%xmm5                     \n"
+      "packuswb  %%xmm5,%%xmm5                   \n"
+      "punpcklbw %%xmm5,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm5                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "pmaddubsw %%xmm4,%%xmm5                   \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "phaddw    %%xmm1,%%xmm5                   \n"
+      "psrlw     $0x7,%%xmm5                     \n"
+      "packuswb  %%xmm5,%%xmm5                   \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "psrld     $0x18,%%xmm6                    \n"
+      "psrld     $0x18,%%xmm1                    \n"
+      "packuswb  %%xmm1,%%xmm6                   \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "punpcklbw %%xmm6,%%xmm5                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklwd %%xmm5,%%xmm0                   \n"
+      "punpckhwd %%xmm5,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%0)                     \n"
+      "movdqu    %%xmm1,0x10(%0)                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub       $0x8,%1                         \n"
+      "jg        1b                              \n"
+      : "+r"(dst_argb),      // %0
+        "+r"(width)          // %1
+      : "m"(kARGBToSepiaB),  // %2
+        "m"(kARGBToSepiaG),  // %3
+        "m"(kARGBToSepiaR)   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBSEPIAROW_SSSE3
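
Using the kARGBToSepia* weights above, the kernel is equivalent to the
following scalar loop (packuswb supplies the saturation):

    // Scalar model of ARGBSepiaRow, in place; alpha is untouched.
    //   b' = (b*17 + g*68 + r*35) >> 7
    //   g' = (b*22 + g*88 + r*45) >> 7
    //   r' = (b*24 + g*98 + r*50) >> 7
    static void ARGBSepiaRow_C_sketch(uint8_t* dst, int width) {
      for (int i = 0; i < width; ++i) {
        int b = dst[i * 4 + 0], g = dst[i * 4 + 1], r = dst[i * 4 + 2];
        int nb = (b * 17 + g * 68 + r * 35) >> 7;
        int ng = (b * 22 + g * 88 + r * 45) >> 7;
        int nr = (b * 24 + g * 98 + r * 50) >> 7;
        dst[i * 4 + 0] = (uint8_t)(nb > 255 ? 255 : nb);
        dst[i * 4 + 1] = (uint8_t)(ng > 255 ? 255 : ng);
        dst[i * 4 + 2] = (uint8_t)(nr > 255 ? 255 : nr);
      }
    }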
 
@@ -3886,339 +5010,347 @@
 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
 // Transform 8 ARGB pixels (32 bytes) with color matrix.
 // Same as Sepia except matrix is provided.
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width) {
-  asm volatile (
-    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
-    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
-    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
-    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
-    "pshufd    $0xff,%%xmm5,%%xmm5             \n"
+void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              const int8_t* matrix_argb,
+                              int width) {
+  asm volatile(
+      "movdqu    (%3),%%xmm5                     \n"
+      "pshufd    $0x00,%%xmm5,%%xmm2             \n"
+      "pshufd    $0x55,%%xmm5,%%xmm3             \n"
+      "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
+      "pshufd    $0xff,%%xmm5,%%xmm5             \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "pmaddubsw %%xmm2,%%xmm7                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "pmaddubsw %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm3,%%xmm1                   \n"
-    "phaddsw   %%xmm7,%%xmm0                   \n"
-    "phaddsw   %%xmm1,%%xmm6                   \n"
-    "psraw     $0x6,%%xmm0                     \n"
-    "psraw     $0x6,%%xmm6                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm4,%%xmm1                   \n"
-    "pmaddubsw %%xmm4,%%xmm7                   \n"
-    "phaddsw   %%xmm7,%%xmm1                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm7                   \n"
-    "phaddsw   %%xmm7,%%xmm6                   \n"
-    "psraw     $0x6,%%xmm1                     \n"
-    "psraw     $0x6,%%xmm6                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "punpcklbw %%xmm6,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "punpcklwd %%xmm1,%%xmm0                   \n"
-    "punpckhwd %%xmm1,%%xmm6                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb),      // %1
-    "+r"(width)          // %2
-  : "r"(matrix_argb)     // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm7                 \n"
+      "pmaddubsw %%xmm2,%%xmm0                   \n"
+      "pmaddubsw %%xmm2,%%xmm7                   \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "pmaddubsw %%xmm3,%%xmm6                   \n"
+      "pmaddubsw %%xmm3,%%xmm1                   \n"
+      "phaddsw   %%xmm7,%%xmm0                   \n"
+      "phaddsw   %%xmm1,%%xmm6                   \n"
+      "psraw     $0x6,%%xmm0                     \n"
+      "psraw     $0x6,%%xmm6                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "punpcklbw %%xmm6,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "movdqu    0x10(%0),%%xmm7                 \n"
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm7                   \n"
+      "phaddsw   %%xmm7,%%xmm1                   \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x10(%0),%%xmm7                 \n"
+      "pmaddubsw %%xmm5,%%xmm6                   \n"
+      "pmaddubsw %%xmm5,%%xmm7                   \n"
+      "phaddsw   %%xmm7,%%xmm6                   \n"
+      "psraw     $0x6,%%xmm1                     \n"
+      "psraw     $0x6,%%xmm6                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "punpcklbw %%xmm6,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm6                   \n"
+      "punpcklwd %%xmm1,%%xmm0                   \n"
+      "punpckhwd %%xmm1,%%xmm6                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm6,0x10(%1)                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      : "r"(matrix_argb)  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
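
The color-matrix kernel broadcasts one 4-byte row of the int8 matrix per
output channel (the four pshufd constants), then pmaddubsw/phaddsw form a
signed dot product that is scaled by >>6 and saturated. As a scalar model:

    // Scalar model of ARGBColorMatrixRow: the matrix is 4 rows of 4
    // signed coefficients, one row per output channel, in B,G,R,A order.
    static void ARGBColorMatrixRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                            const int8_t* m, int width) {
      for (int i = 0; i < width; ++i) {
        const uint8_t* p = src + i * 4;
        for (int c = 0; c < 4; ++c) {
          int v = (p[0] * m[c * 4 + 0] + p[1] * m[c * 4 + 1] +
                   p[2] * m[c * 4 + 2] + p[3] * m[c * 4 + 3]) >> 6;
          dst[i * 4 + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
        }
      }
    }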
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "movd      %2,%%xmm2                       \n"
-    "movd      %3,%%xmm3                       \n"
-    "movd      %4,%%xmm4                       \n"
-    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
-    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
-    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
-    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
-    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
-    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "pslld     $0x18,%%xmm6                    \n"
+void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
+  asm volatile(
+      "movd      %2,%%xmm2                       \n"
+      "movd      %3,%%xmm3                       \n"
+      "movd      %4,%%xmm4                       \n"
+      "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
+      "pshufd    $0x44,%%xmm2,%%xmm2             \n"
+      "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
+      "pshufd    $0x44,%%xmm3,%%xmm3             \n"
+      "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
+      "pshufd    $0x44,%%xmm4,%%xmm4             \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "pslld     $0x18,%%xmm6                    \n"
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "pmullw    %%xmm3,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
-    "pmullw    %%xmm3,%%xmm1                   \n"
-    "pand      %%xmm6,%%xmm7                   \n"
-    "paddw     %%xmm4,%%xmm0                   \n"
-    "paddw     %%xmm4,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "por       %%xmm7,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "sub       $0x4,%1                         \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "punpcklbw %%xmm5,%%xmm0                   \n"
+      "pmulhuw   %%xmm2,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm1                     \n"
+      "punpckhbw %%xmm5,%%xmm1                   \n"
+      "pmulhuw   %%xmm2,%%xmm1                   \n"
+      "pmullw    %%xmm3,%%xmm0                   \n"
+      "movdqu    (%0),%%xmm7                     \n"
+      "pmullw    %%xmm3,%%xmm1                   \n"
+      "pand      %%xmm6,%%xmm7                   \n"
+      "paddw     %%xmm4,%%xmm0                   \n"
+      "paddw     %%xmm4,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "por       %%xmm7,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%0)                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "sub       $0x4,%1                         \n"
+      "jg        1b                              \n"
+      : "+r"(dst_argb),       // %0
+        "+r"(width)           // %1
+      : "r"(scale),           // %2
+        "r"(interval_size),   // %3
+        "r"(interval_offset)  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBQUANTIZEROW_SSE2
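
Quantization posterizes B, G and R in place: scale is a 16-bit fixed-point
reciprocal supplied by the caller (typically 65536/interval_size, an
assumption about the caller, not this patch), so (v * scale) >> 16 selects
the interval index. Alpha survives via the 0xff000000 mask in xmm6. Scalar
sketch:

    // Scalar model of ARGBQuantizeRow (in place); alpha is untouched.
    static void ARGBQuantizeRow_C_sketch(uint8_t* dst, int scale,
                                         int interval_size,
                                         int interval_offset, int width) {
      for (int i = 0; i < width; ++i) {
        for (int c = 0; c < 3; ++c) {
          int v = ((dst[i * 4 + c] * scale) >> 16) * interval_size +
                  interval_offset;
          dst[i * 4 + c] = (uint8_t)(v > 255 ? 255 : v);
        }
      }
    }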
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by a specified value.
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "movd      %3,%%xmm2                       \n"
-    "punpcklbw %%xmm2,%%xmm2                   \n"
-    "punpcklqdq %%xmm2,%%xmm2                  \n"
+void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value) {
+  asm volatile(
+      "movd      %3,%%xmm2                       \n"
+      "punpcklbw %%xmm2,%%xmm2                   \n"
+      "punpcklqdq %%xmm2,%%xmm2                  \n"
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "pmulhuw   %%xmm2,%%xmm1                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(value)       // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2"
-  );
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "punpckhbw %%xmm1,%%xmm1                   \n"
+      "pmulhuw   %%xmm2,%%xmm0                   \n"
+      "pmulhuw   %%xmm2,%%xmm1                   \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(value)       // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_ARGBSHADEROW_SSE2
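
ARGBShadeRow multiplies every channel by the matching channel of the packed
value word, using the same v*257 trick as the attenuate kernels, so each
output byte is (v*257 * s*257) >> 24, roughly v*s/255. Scalar sketch:

    // Scalar model of ARGBShadeRow: per-channel multiply by 'value',
    // whose low byte maps to B on this little-endian lane layout.
    static void ARGBShadeRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                      int width, uint32_t value) {
      for (int i = 0; i < width; ++i) {
        for (int c = 0; c < 4; ++c) {
          uint32_t v = src[i * 4 + c];
          uint32_t s = (value >> (8 * c)) & 0xff;
          dst[i * 4 + c] = (uint8_t)((v * 257 * (s * 257)) >> 24);
        }
      }
    }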
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    "pxor      %%xmm5,%%xmm5                  \n"
+void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "movdqu    %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm2,%%xmm3                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpckhbw %%xmm5,%%xmm3                   \n"
-    "pmulhuw   %%xmm2,%%xmm0                   \n"
-    "pmulhuw   %%xmm3,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      "pxor      %%xmm5,%%xmm5                   \n"
+
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqu    (%1),%%xmm2                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "movdqu    %%xmm0,%%xmm1                   \n"
+      "movdqu    %%xmm2,%%xmm3                   \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "punpckhbw %%xmm1,%%xmm1                   \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "punpckhbw %%xmm5,%%xmm3                   \n"
+      "pmulhuw   %%xmm2,%%xmm0                   \n"
+      "pmulhuw   %%xmm3,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_ARGBMULTIPLYROW_SSE2
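
The multiply kernel forms a per-channel product of the two rows: one
operand is widened to v*257 (punpck with itself) and the other to plain v
(punpck with zero), so pmulhuw leaves (s0*257*s1) >> 16, within one step of
s0*s1/255. Scalar sketch:

    // Scalar model of ARGBMultiplyRow: per-channel product of two rows,
    // normalized back to 8 bits with the kernel's exact fixed-point math.
    static void ARGBMultiplyRow_C_sketch(const uint8_t* src0,
                                         const uint8_t* src1, uint8_t* dst,
                                         int width) {
      for (int i = 0; i < width * 4; ++i) {
        uint32_t a = src0[i], b = src1[i];
        dst[i] = (uint8_t)((a * 257 * b) >> 16);
      }
    }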
 
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
-    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
-    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
-    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea       " MEMLEA(0x20,2) ",%2           \n"
-    "sub        $0x8,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm1                    \n"
+      "lea        0x20(%0),%0                    \n"
+      "vmovdqu    (%1),%%ymm3                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
+      "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
+      "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
+      "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,(%2)                    \n"
+      "lea       0x20(%2),%2                     \n"
+      "sub        $0x8,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc"
 #if defined(__AVX2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+        ,
+        "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
 #endif
-  );
+      );
 }
 #endif  // HAS_ARGBMULTIPLYROW_AVX2
 
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqu    (%1),%%xmm1                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_ARGBADDROW_SSE2
 
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"
-    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea        " MEMLEA(0x20,2) ",%2          \n"
-    "sub        $0x8,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
+void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "lea        0x20(%0),%0                    \n"
+      "vpaddusb   (%1),%%ymm0,%%ymm0             \n"
+      "lea        0x20(%1),%1                    \n"
+      "vmovdqu    %%ymm0,(%2)                    \n"
+      "lea        0x20(%2),%2                    \n"
+      "sub        $0x8,%3                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0");
 }
 #endif  // HAS_ARGBADDROW_AVX2
 
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "psubusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1"
-  );
+void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqu    (%1),%%xmm1                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "psubusb   %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_ARGBSUBTRACTROW_SSE2
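
The add and subtract kernels are plain saturating byte arithmetic
(paddusb/psubusb) across all four channels; subtract clamps at 0 instead of
255. Scalar sketch of the add:

    // Scalar model of ARGBAddRow; ARGBSubtractRow is the same with
    // src0[i] - src1[i] clamped at 0.
    static void ARGBAddRow_C_sketch(const uint8_t* src0, const uint8_t* src1,
                                    uint8_t* dst, int width) {
      for (int i = 0; i < width * 4; ++i) {
        int v = src0[i] + src1[i];
        dst[i] = (uint8_t)(v > 255 ? 255 : v);
      }
    }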
 
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"
-    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
-    "lea        " MEMLEA(0x20,2) ",%2          \n"
-    "sub        $0x8,%3                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "memory", "cc"
-    , "xmm0"
-  );
+void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "lea        0x20(%0),%0                    \n"
+      "vpsubusb   (%1),%%ymm0,%%ymm0             \n"
+      "lea        0x20(%1),%1                    \n"
+      "vmovdqu    %%ymm0,(%2)                    \n"
+      "lea        0x20(%2),%2                    \n"
+      "sub        $0x8,%3                        \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "memory", "cc", "xmm0");
 }
 #endif  // HAS_ARGBSUBTRACTROW_AVX2
 
@@ -4227,52 +5359,53 @@
 // -1  0  1
 // -2  0  2
 // -1  0  1
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "sub       %0,%2                           \n"
-    "sub       %0,%3                           \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
+void SobelXRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width) {
+  asm volatile(
+      "sub       %0,%1                           \n"
+      "sub       %0,%2                           \n"
+      "sub       %0,%3                           \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "psubw     %%xmm1,%%xmm0                   \n"
-    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
-    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "psubw     %%xmm2,%%xmm1                   \n"
-    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
-    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "psubw     %%xmm3,%%xmm2                   \n"
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm1                   \n"
-    "pmaxsw    %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "sub       $0x8,%4                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm0                     \n"
+      "movq      0x2(%0),%%xmm1                  \n"
+      "punpcklbw %%xmm5,%%xmm0                   \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "psubw     %%xmm1,%%xmm0                   \n"
+      "movq      0x00(%0,%1,1),%%xmm1            \n"
+      "movq      0x02(%0,%1,1),%%xmm2            \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "psubw     %%xmm2,%%xmm1                   \n"
+      "movq      0x00(%0,%2,1),%%xmm2            \n"
+      "movq      0x02(%0,%2,1),%%xmm3            \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "punpcklbw %%xmm5,%%xmm3                   \n"
+      "psubw     %%xmm3,%%xmm2                   \n"
+      "paddw     %%xmm2,%%xmm0                   \n"
+      "paddw     %%xmm1,%%xmm0                   \n"
+      "paddw     %%xmm1,%%xmm0                   \n"
+      "pxor      %%xmm1,%%xmm1                   \n"
+      "psubw     %%xmm0,%%xmm1                   \n"
+      "pmaxsw    %%xmm1,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,0x00(%0,%3,1)            \n"
+      "lea       0x8(%0),%0                      \n"
+      "sub       $0x8,%4                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_y0),      // %0
+        "+r"(src_y1),      // %1
+        "+r"(src_y2),      // %2
+        "+r"(dst_sobelx),  // %3
+        "+r"(width)        // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SOBELXROW_SSE2
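
SobelXRow applies the horizontal kernel above across three input rows; the
pxor/psubw/pmaxsw triple at the end computes the absolute value, and
packuswb clamps to 8 bits. Scalar sketch:

    // Scalar model of SobelXRow:
    //   |(y0[i]-y0[i+2]) + 2*(y1[i]-y1[i+2]) + (y2[i]-y2[i+2])|
    // saturated to 8 bits. Like the kernel, it reads 2 bytes past width.
    static void SobelXRow_C_sketch(const uint8_t* y0, const uint8_t* y1,
                                   const uint8_t* y2, uint8_t* dst,
                                   int width) {
      for (int i = 0; i < width; ++i) {
        int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
                (y2[i] - y2[i + 2]);
        if (s < 0) s = -s;
        dst[i] = (uint8_t)(s > 255 ? 255 : s);
      }
    }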
 
@@ -4281,50 +5414,50 @@
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "sub       %0,%2                           \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
+void SobelYRow_SSE2(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width) {
+  asm volatile(
+      "sub       %0,%1                           \n"
+      "sub       %0,%2                           \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "psubw     %%xmm1,%%xmm0                   \n"
-    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
-    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "psubw     %%xmm2,%%xmm1                   \n"
-    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
-    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "psubw     %%xmm3,%%xmm2                   \n"
-    "paddw     %%xmm2,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "paddw     %%xmm1,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "psubw     %%xmm0,%%xmm1                   \n"
-    "pmaxsw    %%xmm1,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "sub       $0x8,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm0                     \n"
+      "movq      0x00(%0,%1,1),%%xmm1            \n"
+      "punpcklbw %%xmm5,%%xmm0                   \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "psubw     %%xmm1,%%xmm0                   \n"
+      "movq      0x1(%0),%%xmm1                  \n"
+      "movq      0x01(%0,%1,1),%%xmm2            \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "psubw     %%xmm2,%%xmm1                   \n"
+      "movq      0x2(%0),%%xmm2                  \n"
+      "movq      0x02(%0,%1,1),%%xmm3            \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "punpcklbw %%xmm5,%%xmm3                   \n"
+      "psubw     %%xmm3,%%xmm2                   \n"
+      "paddw     %%xmm2,%%xmm0                   \n"
+      "paddw     %%xmm1,%%xmm0                   \n"
+      "paddw     %%xmm1,%%xmm0                   \n"
+      "pxor      %%xmm1,%%xmm1                   \n"
+      "psubw     %%xmm0,%%xmm1                   \n"
+      "pmaxsw    %%xmm1,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,0x00(%0,%2,1)            \n"
+      "lea       0x8(%0),%0                      \n"
+      "sub       $0x8,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_y0),      // %0
+        "+r"(src_y1),      // %1
+        "+r"(dst_sobely),  // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SOBELYROW_SSE2
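SobelYRow is the transposed filter. Its middle kernel row is all
zeros, so only two source rows are read, and the 1/2/1 weights move
to the column offsets i, i+1 and i+2. In scalar terms (illustrative
sketch, same conventions as above):

    #include <stdint.h>

    static void SobelYRow_Sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                                 uint8_t* dst_sobely, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        int a = src_y0[i + 0] - src_y1[i + 0];
        int b = src_y0[i + 1] - src_y1[i + 1];
        int c = src_y0[i + 2] - src_y1[i + 2];
        int sobel = a + b + b + c;
        if (sobel < 0) sobel = -sobel;
        dst_sobely[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
      }
    }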
 
@@ -4334,79 +5467,79 @@
 // R = Sobel
 // G = Sobel
 // B = Sobel
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
+void SobelRow_SSE2(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width) {
+  asm volatile(
+      "sub       %0,%1                           \n"
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "pslld     $0x18,%%xmm5                    \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "punpcklbw %%xmm0,%%xmm2                   \n"
-    "punpckhbw %%xmm0,%%xmm0                   \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "punpcklwd %%xmm2,%%xmm1                   \n"
-    "punpckhwd %%xmm2,%%xmm2                   \n"
-    "por       %%xmm5,%%xmm1                   \n"
-    "por       %%xmm5,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklwd %%xmm0,%%xmm3                   \n"
-    "punpckhwd %%xmm0,%%xmm0                   \n"
-    "por       %%xmm5,%%xmm3                   \n"
-    "por       %%xmm5,%%xmm0                   \n"
-    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
-    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%1,1),%%xmm1            \n"
+      "lea       0x10(%0),%0                     \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "punpcklbw %%xmm0,%%xmm2                   \n"
+      "punpckhbw %%xmm0,%%xmm0                   \n"
+      "movdqa    %%xmm2,%%xmm1                   \n"
+      "punpcklwd %%xmm2,%%xmm1                   \n"
+      "punpckhwd %%xmm2,%%xmm2                   \n"
+      "por       %%xmm5,%%xmm1                   \n"
+      "por       %%xmm5,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm3                   \n"
+      "punpcklwd %%xmm0,%%xmm3                   \n"
+      "punpckhwd %%xmm0,%%xmm0                   \n"
+      "por       %%xmm5,%%xmm3                   \n"
+      "por       %%xmm5,%%xmm0                   \n"
+      "movdqu    %%xmm1,(%2)                     \n"
+      "movdqu    %%xmm2,0x10(%2)                 \n"
+      "movdqu    %%xmm3,0x20(%2)                 \n"
+      "movdqu    %%xmm0,0x30(%2)                 \n"
+      "lea       0x40(%2),%2                     \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SOBELROW_SSE2
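SobelRow saturate-adds the two gradients with paddusb, then the
punpcklbw/punpcklwd ladder replicates each byte four times while
xmm5 (0xff000000 per pixel, built from pcmpeqb plus pslld $0x18) ORs
in opaque alpha. A scalar sketch of the packing:

    #include <stdint.h>

    static void SobelRow_Sketch(const uint8_t* src_sobelx,
                                const uint8_t* src_sobely,
                                uint8_t* dst_argb, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        int s = src_sobelx[i] + src_sobely[i];
        uint8_t v = (uint8_t)(s > 255 ? 255 : s);
        dst_argb[4 * i + 0] = v;     // B
        dst_argb[4 * i + 1] = v;     // G
        dst_argb[4 * i + 2] = v;     // R
        dst_argb[4 * i + 3] = 0xff;  // A
      }
    }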
 
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "pslld     $0x18,%%xmm5                    \n"
+void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width) {
+  asm volatile(
+      "sub       %0,%1                           \n"
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "pslld     $0x18,%%xmm5                    \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%1,1),%%xmm1            \n"
+      "lea       0x10(%0),%0                     \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_y),       // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_SOBELTOPLANEROW_SSE2
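SobelToPlaneRow is the same saturated add without the ARGB expansion:
dst_y[i] = min(src_sobelx[i] + src_sobely[i], 255), which is exactly
what the single paddusb computes, so each iteration moves 16 bytes
with no unpacking at all.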
 
@@ -4416,47 +5549,48 @@
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "sub       %0,%1                           \n"
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      "sub       %0,%1                           \n"
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
 
-    // 8 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "paddusb   %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm0,%%xmm3                   \n"
-    "punpcklbw %%xmm5,%%xmm3                   \n"
-    "punpckhbw %%xmm5,%%xmm0                   \n"
-    "movdqa    %%xmm1,%%xmm4                   \n"
-    "punpcklbw %%xmm2,%%xmm4                   \n"
-    "punpckhbw %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm4,%%xmm6                   \n"
-    "punpcklwd %%xmm3,%%xmm6                   \n"
-    "punpckhwd %%xmm3,%%xmm4                   \n"
-    "movdqa    %%xmm1,%%xmm7                   \n"
-    "punpcklwd %%xmm0,%%xmm7                   \n"
-    "punpckhwd %%xmm0,%%xmm1                   \n"
-    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
-    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
-    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "sub       $0x10,%3                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%1,1),%%xmm1            \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "paddusb   %%xmm1,%%xmm2                   \n"
+      "movdqa    %%xmm0,%%xmm3                   \n"
+      "punpcklbw %%xmm5,%%xmm3                   \n"
+      "punpckhbw %%xmm5,%%xmm0                   \n"
+      "movdqa    %%xmm1,%%xmm4                   \n"
+      "punpcklbw %%xmm2,%%xmm4                   \n"
+      "punpckhbw %%xmm2,%%xmm1                   \n"
+      "movdqa    %%xmm4,%%xmm6                   \n"
+      "punpcklwd %%xmm3,%%xmm6                   \n"
+      "punpckhwd %%xmm3,%%xmm4                   \n"
+      "movdqa    %%xmm1,%%xmm7                   \n"
+      "punpcklwd %%xmm0,%%xmm7                   \n"
+      "punpckhwd %%xmm0,%%xmm1                   \n"
+      "movdqu    %%xmm6,(%2)                     \n"
+      "movdqu    %%xmm4,0x10(%2)                 \n"
+      "movdqu    %%xmm7,0x20(%2)                 \n"
+      "movdqu    %%xmm1,0x30(%2)                 \n"
+      "lea       0x40(%2),%2                     \n"
+      "sub       $0x10,%3                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_SOBELXYROW_SSE2
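SobelXYRow keeps the gradients separate, matching the comment above
the function: R takes Sobel X, B takes Sobel Y, G takes their
saturated sum (xmm2 from paddusb), and alpha is all ones from
pcmpeqb. Illustrative scalar form:

    #include <stdint.h>

    static void SobelXYRow_Sketch(const uint8_t* src_sobelx,
                                  const uint8_t* src_sobely,
                                  uint8_t* dst_argb, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        int s = src_sobelx[i] + src_sobely[i];
        dst_argb[4 * i + 0] = src_sobely[i];                 // B = Sobel Y
        dst_argb[4 * i + 1] = (uint8_t)(s > 255 ? 255 : s);  // G = Sobel
        dst_argb[4 * i + 2] = src_sobelx[i];                 // R = Sobel X
        dst_argb[4 * i + 3] = 0xff;                          // A
      }
    }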
 
@@ -4463,211 +5597,212 @@
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value, inclusive of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width) {
-  asm volatile (
-    "pxor      %%xmm0,%%xmm0                   \n"
-    "pxor      %%xmm1,%%xmm1                   \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-    "test      $0xf,%1                         \n"
-    "jne       49f                             \n"
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+                                  int32_t* cumsum,
+                                  const int32_t* previous_cumsum,
+                                  int width) {
+  asm volatile(
+      "pxor      %%xmm0,%%xmm0                   \n"
+      "pxor      %%xmm1,%%xmm1                   \n"
+      "sub       $0x4,%3                         \n"
+      "jl        49f                             \n"
+      "test      $0xf,%1                         \n"
+      "jne       49f                             \n"
 
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm2,%%xmm4                   \n"
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "punpcklwd %%xmm1,%%xmm2                   \n"
-    "punpckhwd %%xmm1,%%xmm3                   \n"
-    "punpckhbw %%xmm1,%%xmm4                   \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "punpcklwd %%xmm1,%%xmm4                   \n"
-    "punpckhwd %%xmm1,%%xmm5                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
-    "paddd     %%xmm0,%%xmm3                   \n"
-    "paddd     %%xmm4,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
-    "paddd     %%xmm0,%%xmm4                   \n"
-    "paddd     %%xmm5,%%xmm0                   \n"
-    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
-    "lea       " MEMLEA(0x40,2) ",%2           \n"
-    "paddd     %%xmm0,%%xmm5                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
-    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
-    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
+      // 4 pixel loop.
+      LABELALIGN
+      "40:                                       \n"
+      "movdqu    (%0),%%xmm2                     \n"
+      "lea       0x10(%0),%0                     \n"
+      "movdqa    %%xmm2,%%xmm4                   \n"
+      "punpcklbw %%xmm1,%%xmm2                   \n"
+      "movdqa    %%xmm2,%%xmm3                   \n"
+      "punpcklwd %%xmm1,%%xmm2                   \n"
+      "punpckhwd %%xmm1,%%xmm3                   \n"
+      "punpckhbw %%xmm1,%%xmm4                   \n"
+      "movdqa    %%xmm4,%%xmm5                   \n"
+      "punpcklwd %%xmm1,%%xmm4                   \n"
+      "punpckhwd %%xmm1,%%xmm5                   \n"
+      "paddd     %%xmm2,%%xmm0                   \n"
+      "movdqu    (%2),%%xmm2                     \n"
+      "paddd     %%xmm0,%%xmm2                   \n"
+      "paddd     %%xmm3,%%xmm0                   \n"
+      "movdqu    0x10(%2),%%xmm3                 \n"
+      "paddd     %%xmm0,%%xmm3                   \n"
+      "paddd     %%xmm4,%%xmm0                   \n"
+      "movdqu    0x20(%2),%%xmm4                 \n"
+      "paddd     %%xmm0,%%xmm4                   \n"
+      "paddd     %%xmm5,%%xmm0                   \n"
+      "movdqu    0x30(%2),%%xmm5                 \n"
+      "lea       0x40(%2),%2                     \n"
+      "paddd     %%xmm0,%%xmm5                   \n"
+      "movdqu    %%xmm2,(%1)                     \n"
+      "movdqu    %%xmm3,0x10(%1)                 \n"
+      "movdqu    %%xmm4,0x20(%1)                 \n"
+      "movdqu    %%xmm5,0x30(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x4,%3                         \n"
+      "jge       40b                             \n"
 
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        19f                             \n"
+      "49:                                       \n"
+      "add       $0x3,%3                         \n"
+      "jl        19f                             \n"
 
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "movd      " MEMACCESS(0) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "punpcklbw %%xmm1,%%xmm2                   \n"
-    "punpcklwd %%xmm1,%%xmm2                   \n"
-    "paddd     %%xmm2,%%xmm0                   \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x1,%3                         \n"
-    "jge       10b                             \n"
+      // 1 pixel loop.
+      LABELALIGN
+      "10:                                       \n"
+      "movd      (%0),%%xmm2                     \n"
+      "lea       0x4(%0),%0                      \n"
+      "punpcklbw %%xmm1,%%xmm2                   \n"
+      "punpcklwd %%xmm1,%%xmm2                   \n"
+      "paddd     %%xmm2,%%xmm0                   \n"
+      "movdqu    (%2),%%xmm2                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "paddd     %%xmm0,%%xmm2                   \n"
+      "movdqu    %%xmm2,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x1,%3                         \n"
+      "jge       10b                             \n"
 
-  "19:                                         \n"
-  : "+r"(row),  // %0
-    "+r"(cumsum),  // %1
-    "+r"(previous_cumsum),  // %2
-    "+r"(width)  // %3
-  :
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      "19:                                       \n"
+      : "+r"(row),              // %0
+        "+r"(cumsum),           // %1
+        "+r"(previous_cumsum),  // %2
+        "+r"(width)             // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
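Here xmm0 carries the running per-channel totals across the row while
xmm1 stays zero for the unpacking; each ARGB pixel is widened to four
int32 lanes and the row above (previous_cumsum) is added, so every
output entry is a 2-D inclusive prefix sum. The "test $0xf,%1" guard
falls back to the 1 pixel loop when cumsum is not 16-byte aligned.
A scalar sketch of the recurrence (illustrative only):

    #include <stdint.h>

    static void ComputeCumulativeSumRow_Sketch(const uint8_t* row,
                                               int32_t* cumsum,
                                               const int32_t* previous_cumsum,
                                               int width) {
      int32_t sum[4] = {0, 0, 0, 0};  // running B, G, R, A totals
      int x, c;
      for (x = 0; x < width; ++x) {
        for (c = 0; c < 4; ++c) {
          sum[c] += row[4 * x + c];
          cumsum[4 * x + c] = sum[c] + previous_cumsum[4 * x + c];
        }
      }
    }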
 
 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst,
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+                                    const int32_t* botleft,
+                                    int width,
+                                    int area,
+                                    uint8_t* dst,
                                     int count) {
-  asm volatile (
-    "movd      %5,%%xmm5                       \n"
-    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
-    "rcpss     %%xmm5,%%xmm4                   \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
-    "sub       $0x4,%3                         \n"
-    "jl        49f                             \n"
-    "cmpl      $0x80,%5                        \n"
-    "ja        40f                             \n"
+  asm volatile(
+      "movd      %5,%%xmm5                       \n"
+      "cvtdq2ps  %%xmm5,%%xmm5                   \n"
+      "rcpss     %%xmm5,%%xmm4                   \n"
+      "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+      "sub       $0x4,%3                         \n"
+      "jl        49f                             \n"
+      "cmpl      $0x80,%5                        \n"
+      "ja        40f                             \n"
 
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrld     $0x10,%%xmm6                    \n"
-    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
-    "addps     %%xmm6,%%xmm5                   \n"
-    "mulps     %%xmm4,%%xmm5                   \n"
-    "cvtps2dq  %%xmm5,%%xmm5                   \n"
-    "packssdw  %%xmm5,%%xmm5                   \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "psrld     $0x10,%%xmm6                    \n"
+      "cvtdq2ps  %%xmm6,%%xmm6                   \n"
+      "addps     %%xmm6,%%xmm5                   \n"
+      "mulps     %%xmm4,%%xmm5                   \n"
+      "cvtps2dq  %%xmm5,%%xmm5                   \n"
+      "packssdw  %%xmm5,%%xmm5                   \n"
 
-  // 4 pixel small loop                        \n"
-    LABELALIGN
-  "4:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
-    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
-    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
-    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
-    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
-    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
-    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "packssdw  %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm3,%%xmm2                   \n"
-    "pmulhuw   %%xmm5,%%xmm0                   \n"
-    "pmulhuw   %%xmm5,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       4b                              \n"
-    "jmp       49f                             \n"
+      // 4 pixel small loop.
+      LABELALIGN
+      "4:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "psubd     0x00(%0,%4,4),%%xmm0            \n"
+      "psubd     0x10(%0,%4,4),%%xmm1            \n"
+      "psubd     0x20(%0,%4,4),%%xmm2            \n"
+      "psubd     0x30(%0,%4,4),%%xmm3            \n"
+      "lea       0x40(%0),%0                     \n"
+      "psubd     (%1),%%xmm0                     \n"
+      "psubd     0x10(%1),%%xmm1                 \n"
+      "psubd     0x20(%1),%%xmm2                 \n"
+      "psubd     0x30(%1),%%xmm3                 \n"
+      "paddd     0x00(%1,%4,4),%%xmm0            \n"
+      "paddd     0x10(%1,%4,4),%%xmm1            \n"
+      "paddd     0x20(%1,%4,4),%%xmm2            \n"
+      "paddd     0x30(%1,%4,4),%%xmm3            \n"
+      "lea       0x40(%1),%1                     \n"
+      "packssdw  %%xmm1,%%xmm0                   \n"
+      "packssdw  %%xmm3,%%xmm2                   \n"
+      "pmulhuw   %%xmm5,%%xmm0                   \n"
+      "pmulhuw   %%xmm5,%%xmm2                   \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jge       4b                              \n"
+      "jmp       49f                             \n"
 
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
-    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
-    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
-    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
-    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
-    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
-    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
-    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
-    "mulps     %%xmm4,%%xmm0                   \n"
-    "mulps     %%xmm4,%%xmm1                   \n"
-    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
-    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
-    "mulps     %%xmm4,%%xmm2                   \n"
-    "mulps     %%xmm4,%%xmm3                   \n"
-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
-    "cvtps2dq  %%xmm1,%%xmm1                   \n"
-    "cvtps2dq  %%xmm2,%%xmm2                   \n"
-    "cvtps2dq  %%xmm3,%%xmm3                   \n"
-    "packssdw  %%xmm1,%%xmm0                   \n"
-    "packssdw  %%xmm3,%%xmm2                   \n"
-    "packuswb  %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jge       40b                             \n"
+      // 4 pixel loop.
+      LABELALIGN
+      "40:                                       \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x20(%0),%%xmm2                 \n"
+      "movdqu    0x30(%0),%%xmm3                 \n"
+      "psubd     0x00(%0,%4,4),%%xmm0            \n"
+      "psubd     0x10(%0,%4,4),%%xmm1            \n"
+      "psubd     0x20(%0,%4,4),%%xmm2            \n"
+      "psubd     0x30(%0,%4,4),%%xmm3            \n"
+      "lea       0x40(%0),%0                     \n"
+      "psubd     (%1),%%xmm0                     \n"
+      "psubd     0x10(%1),%%xmm1                 \n"
+      "psubd     0x20(%1),%%xmm2                 \n"
+      "psubd     0x30(%1),%%xmm3                 \n"
+      "paddd     0x00(%1,%4,4),%%xmm0            \n"
+      "paddd     0x10(%1,%4,4),%%xmm1            \n"
+      "paddd     0x20(%1,%4,4),%%xmm2            \n"
+      "paddd     0x30(%1,%4,4),%%xmm3            \n"
+      "lea       0x40(%1),%1                     \n"
+      "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+      "cvtdq2ps  %%xmm1,%%xmm1                   \n"
+      "mulps     %%xmm4,%%xmm0                   \n"
+      "mulps     %%xmm4,%%xmm1                   \n"
+      "cvtdq2ps  %%xmm2,%%xmm2                   \n"
+      "cvtdq2ps  %%xmm3,%%xmm3                   \n"
+      "mulps     %%xmm4,%%xmm2                   \n"
+      "mulps     %%xmm4,%%xmm3                   \n"
+      "cvtps2dq  %%xmm0,%%xmm0                   \n"
+      "cvtps2dq  %%xmm1,%%xmm1                   \n"
+      "cvtps2dq  %%xmm2,%%xmm2                   \n"
+      "cvtps2dq  %%xmm3,%%xmm3                   \n"
+      "packssdw  %%xmm1,%%xmm0                   \n"
+      "packssdw  %%xmm3,%%xmm2                   \n"
+      "packuswb  %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jge       40b                             \n"
 
-  "49:                                         \n"
-    "add       $0x3,%3                         \n"
-    "jl        19f                             \n"
+      "49:                                       \n"
+      "add       $0x3,%3                         \n"
+      "jl        19f                             \n"
 
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "mulps     %%xmm4,%%xmm0                   \n"
-    "cvtps2dq  %%xmm0,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x4,2) ",%2            \n"
-    "sub       $0x1,%3                         \n"
-    "jge       10b                             \n"
-  "19:                                         \n"
-  : "+r"(topleft),  // %0
-    "+r"(botleft),  // %1
-    "+r"(dst),      // %2
-    "+rm"(count)    // %3
-  : "r"((intptr_t)(width)),  // %4
-    "rm"(area)     // %5
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+      // 1 pixel loop.
+      LABELALIGN
+      "10:                                       \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "psubd     0x00(%0,%4,4),%%xmm0            \n"
+      "lea       0x10(%0),%0                     \n"
+      "psubd     (%1),%%xmm0                     \n"
+      "paddd     0x00(%1,%4,4),%%xmm0            \n"
+      "lea       0x10(%1),%1                     \n"
+      "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+      "mulps     %%xmm4,%%xmm0                   \n"
+      "cvtps2dq  %%xmm0,%%xmm0                   \n"
+      "packssdw  %%xmm0,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movd      %%xmm0,(%2)                     \n"
+      "lea       0x4(%2),%2                      \n"
+      "sub       $0x1,%3                         \n"
+      "jge       10b                             \n"
+      "19:                                       \n"
+      : "+r"(topleft),           // %0
+        "+r"(botleft),           // %1
+        "+r"(dst),               // %2
+        "+rm"(count)             // %3
+      : "r"((intptr_t)(width)),  // %4
+        "rm"(area)               // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
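With the integral image built, the average over a box needs only its
four corners: per channel, sum = botleft[w] - botleft[0] - topleft[w]
+ topleft[0], where the width operand (%4) is given in int32 lanes
(four per ARGB pixel), so 0x00(%0,%4,4) addresses the far corner. For
area <= 0x80 the small loop scales by a 16-bit factor of roughly
65536/area via pmulhuw (the box sums still fit 16 bits); larger areas
take the float path through cvtdq2ps/mulps with the rcpss reciprocal.
A scalar sketch of the float path (names illustrative):

    #include <stdint.h>

    static void CumulativeSumToAverageRow_Sketch(const int32_t* topleft,
                                                 const int32_t* botleft,
                                                 int w, int area,
                                                 uint8_t* dst, int count) {
      float ooa = 1.0f / area;  // one over area
      int i, c;
      for (i = 0; i < count; ++i) {
        for (c = 0; c < 4; ++c) {
          int32_t sum =
              botleft[w + c] - botleft[c] - topleft[w + c] + topleft[c];
          dst[c] = (uint8_t)(sum * ooa);
        }
        dst += 4;
        topleft += 4;
        botleft += 4;
      }
    }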
 
@@ -4674,746 +5809,863 @@
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
 LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* src_dudv, int width) {
+void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                        int src_argb_stride,
+                        uint8_t* dst_argb,
+                        const float* src_dudv,
+                        int width) {
   intptr_t src_argb_stride_temp = src_argb_stride;
   intptr_t temp;
-  asm volatile (
-    "movq      " MEMACCESS(3) ",%%xmm2         \n"
-    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
-    "shl       $0x10,%1                        \n"
-    "add       $0x4,%1                         \n"
-    "movd      %1,%%xmm5                       \n"
-    "sub       $0x4,%4                         \n"
-    "jl        49f                             \n"
+  asm volatile(
+      "movq      (%3),%%xmm2                     \n"
+      "movq      0x08(%3),%%xmm7                 \n"
+      "shl       $0x10,%1                        \n"
+      "add       $0x4,%1                         \n"
+      "movd      %1,%%xmm5                       \n"
+      "sub       $0x4,%4                         \n"
+      "jl        49f                             \n"
 
-    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "addps     %%xmm7,%%xmm0                   \n"
-    "movlhps   %%xmm0,%%xmm2                   \n"
-    "movdqa    %%xmm7,%%xmm4                   \n"
-    "addps     %%xmm4,%%xmm4                   \n"
-    "movdqa    %%xmm2,%%xmm3                   \n"
-    "addps     %%xmm4,%%xmm3                   \n"
-    "addps     %%xmm4,%%xmm4                   \n"
+      "pshufd    $0x44,%%xmm7,%%xmm7             \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "movdqa    %%xmm2,%%xmm0                   \n"
+      "addps     %%xmm7,%%xmm0                   \n"
+      "movlhps   %%xmm0,%%xmm2                   \n"
+      "movdqa    %%xmm7,%%xmm4                   \n"
+      "addps     %%xmm4,%%xmm4                   \n"
+      "movdqa    %%xmm2,%%xmm3                   \n"
+      "addps     %%xmm4,%%xmm3                   \n"
+      "addps     %%xmm4,%%xmm4                   \n"
 
-  // 4 pixel loop                              \n"
-    LABELALIGN
-  "40:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
-    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
-    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
-    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
-    "movd      %%xmm0,%k1                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    "movd      %%xmm0,%k5                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
-    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
-    "punpckldq %%xmm6,%%xmm1                   \n"
-    "addps     %%xmm4,%%xmm2                   \n"
-    "movq      %%xmm1," MEMACCESS(2) "         \n"
-    "movd      %%xmm0,%k1                      \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
-    "movd      %%xmm0,%k5                      \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
-    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
-    "punpckldq %%xmm6,%%xmm0                   \n"
-    "addps     %%xmm4,%%xmm3                   \n"
-    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%4                         \n"
-    "jge       40b                             \n"
+      // 4 pixel loop.
+      LABELALIGN
+      "40:                                       \n"
+      "cvttps2dq %%xmm2,%%xmm0                   \n"  // x,y float->int first 2
+      "cvttps2dq %%xmm3,%%xmm1                   \n"  // x,y float->int next 2
+      "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
+      "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x*4 + y*stride
+      "movd      %%xmm0,%k1                      \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      "movd      %%xmm0,%k5                      \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      "movd      0x00(%0,%1,1),%%xmm1            \n"
+      "movd      0x00(%0,%5,1),%%xmm6            \n"
+      "punpckldq %%xmm6,%%xmm1                   \n"
+      "addps     %%xmm4,%%xmm2                   \n"
+      "movq      %%xmm1,(%2)                     \n"
+      "movd      %%xmm0,%k1                      \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      "movd      %%xmm0,%k5                      \n"
+      "movd      0x00(%0,%1,1),%%xmm0            \n"
+      "movd      0x00(%0,%5,1),%%xmm6            \n"
+      "punpckldq %%xmm6,%%xmm0                   \n"
+      "addps     %%xmm4,%%xmm3                   \n"
+      "movq      %%xmm0,0x08(%2)                 \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%4                         \n"
+      "jge       40b                             \n"
 
-  "49:                                         \n"
-    "add       $0x3,%4                         \n"
-    "jl        19f                             \n"
+      "49:                                       \n"
+      "add       $0x3,%4                         \n"
+      "jl        19f                             \n"
 
-  // 1 pixel loop                              \n"
-    LABELALIGN
-  "10:                                         \n"
-    "cvttps2dq %%xmm2,%%xmm0                   \n"
-    "packssdw  %%xmm0,%%xmm0                   \n"
-    "pmaddwd   %%xmm5,%%xmm0                   \n"
-    "addps     %%xmm7,%%xmm2                   \n"
-    "movd      %%xmm0,%k1                      \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x04,2) ",%2           \n"
-    "sub       $0x1,%4                         \n"
-    "jge       10b                             \n"
-  "19:                                         \n"
-  : "+r"(src_argb),  // %0
-    "+r"(src_argb_stride_temp),  // %1
-    "+r"(dst_argb),  // %2
-    "+r"(src_dudv),  // %3
-    "+rm"(width),    // %4
-    "=&r"(temp)      // %5
-  :
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 1 pixel loop.
+      LABELALIGN
+      "10:                                       \n"
+      "cvttps2dq %%xmm2,%%xmm0                   \n"
+      "packssdw  %%xmm0,%%xmm0                   \n"
+      "pmaddwd   %%xmm5,%%xmm0                   \n"
+      "addps     %%xmm7,%%xmm2                   \n"
+      "movd      %%xmm0,%k1                      \n"
+      "movd      0x00(%0,%1,1),%%xmm0            \n"
+      "movd      %%xmm0,(%2)                     \n"
+      "lea       0x04(%2),%2                     \n"
+      "sub       $0x1,%4                         \n"
+      "jge       10b                             \n"
+      "19:                                       \n"
+      : "+r"(src_argb),              // %0
+        "+r"(src_argb_stride_temp),  // %1
+        "+r"(dst_argb),              // %2
+        "+r"(src_dudv),              // %3
+        "+rm"(width),                // %4
+        "=&r"(temp)                  // %5
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBAFFINEROW_SSE2
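The affine row keeps two (u, v) coordinate pairs stepping in xmm2 and
xmm3. Operand %1 is repacked as (stride << 16) | 4, so after
cvttps2dq/packssdw the single pmaddwd against xmm5 turns each (x, y)
short pair into the byte offset x * 4 + y * stride. A scalar sketch
of the nearest-pixel sampling (illustrative only):

    #include <stdint.h>

    static void ARGBAffineRow_Sketch(const uint8_t* src_argb, int src_stride,
                                     uint8_t* dst_argb, const float* src_dudv,
                                     int width) {
      float u = src_dudv[0];
      float v = src_dudv[1];
      int i;
      for (i = 0; i < width; ++i) {
        int x = (int)u;  // truncation, matching cvttps2dq
        int y = (int)v;
        const uint8_t* s = src_argb + y * src_stride + x * 4;
        dst_argb[4 * i + 0] = s[0];
        dst_argb[4 * i + 1] = s[1];
        dst_argb[4 * i + 2] = s[2];
        dst_argb[4 * i + 3] = s[3];
        u += src_dudv[2];  // du per destination pixel
        v += src_dudv[3];  // dv per destination pixel
      }
    }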
 
 #ifdef HAS_INTERPOLATEROW_SSSE3
 // Bilinear filter 16x2 -> 16x1
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
+void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
                           int source_y_fraction) {
-  asm volatile (
-    "sub       %1,%0                           \n"
-    "cmp       $0x0,%3                         \n"
-    "je        100f                            \n"
-    "cmp       $0x80,%3                        \n"
-    "je        50f                             \n"
+  asm volatile(
+      "sub       %1,%0                           \n"
+      "cmp       $0x0,%3                         \n"
+      "je        100f                            \n"
+      "cmp       $0x80,%3                        \n"
+      "je        50f                             \n"
 
-    "movd      %3,%%xmm0                       \n"
-    "neg       %3                              \n"
-    "add       $0x100,%3                       \n"
-    "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm0,%%xmm5                   \n"
-    "punpcklwd %%xmm5,%%xmm5                   \n"
-    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
-    "mov       $0x80808080,%%eax               \n"
-    "movd      %%eax,%%xmm4                    \n"
-    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
+      "movd      %3,%%xmm0                       \n"
+      "neg       %3                              \n"
+      "add       $0x100,%3                       \n"
+      "movd      %3,%%xmm5                       \n"
+      "punpcklbw %%xmm0,%%xmm5                   \n"
+      "punpcklwd %%xmm5,%%xmm5                   \n"
+      "pshufd    $0x0,%%xmm5,%%xmm5              \n"
+      "mov       $0x80808080,%%eax               \n"
+      "movd      %%eax,%%xmm4                    \n"
+      "pshufd    $0x0,%%xmm4,%%xmm4              \n"
 
-    // General purpose row blend.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
-    "movdqa     %%xmm0,%%xmm1                  \n"
-    "punpcklbw  %%xmm2,%%xmm0                  \n"
-    "punpckhbw  %%xmm2,%%xmm1                  \n"
-    "psubb      %%xmm4,%%xmm0                  \n"
-    "psubb      %%xmm4,%%xmm1                  \n"
-    "movdqa     %%xmm5,%%xmm2                  \n"
-    "movdqa     %%xmm5,%%xmm3                  \n"
-    "pmaddubsw  %%xmm0,%%xmm2                  \n"
-    "pmaddubsw  %%xmm1,%%xmm3                  \n"
-    "paddw      %%xmm4,%%xmm2                  \n"
-    "paddw      %%xmm4,%%xmm3                  \n"
-    "psrlw      $0x8,%%xmm2                    \n"
-    "psrlw      $0x8,%%xmm3                    \n"
-    "packuswb   %%xmm3,%%xmm2                  \n"
-    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
+      // General purpose row blend.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "movdqu    0x00(%1,%4,1),%%xmm2            \n"
+      "movdqa     %%xmm0,%%xmm1                  \n"
+      "punpcklbw  %%xmm2,%%xmm0                  \n"
+      "punpckhbw  %%xmm2,%%xmm1                  \n"
+      "psubb      %%xmm4,%%xmm0                  \n"
+      "psubb      %%xmm4,%%xmm1                  \n"
+      "movdqa     %%xmm5,%%xmm2                  \n"
+      "movdqa     %%xmm5,%%xmm3                  \n"
+      "pmaddubsw  %%xmm0,%%xmm2                  \n"
+      "pmaddubsw  %%xmm1,%%xmm3                  \n"
+      "paddw      %%xmm4,%%xmm2                  \n"
+      "paddw      %%xmm4,%%xmm3                  \n"
+      "psrlw      $0x8,%%xmm2                    \n"
+      "psrlw      $0x8,%%xmm3                    \n"
+      "packuswb   %%xmm3,%%xmm2                  \n"
+      "movdqu    %%xmm2,0x00(%1,%0,1)            \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      "jmp       99f                             \n"
 
-    // Blend 50 / 50.
-    LABELALIGN
-  "50:                                         \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        50b                             \n"
-    "jmp       99f                             \n"
+      // Blend 50 / 50.
+      LABELALIGN
+      "50:                                       \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "movdqu    0x00(%1,%4,1),%%xmm1            \n"
+      "pavgb     %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,0x00(%1,%0,1)            \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        50b                             \n"
+      "jmp       99f                             \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-    LABELALIGN
-  "100:                                        \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        100b                            \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      LABELALIGN
+      "100:                                      \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "movdqu    %%xmm0,0x00(%1,%0,1)            \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        100b                            \n"
 
-  "99:                                         \n"
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+rm"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      "99:                                       \n"
+      : "+r"(dst_ptr),               // %0
+        "+r"(src_ptr),               // %1
+        "+rm"(dst_width),            // %2
+        "+r"(source_y_fraction)      // %3
+      : "r"((intptr_t)(src_stride))  // %4
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_INTERPOLATEROW_SSSE3
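InterpolateRow has three paths: fraction 0 copies the row, fraction
0x80 averages with pavgb, and everything else is a fixed-point lerp.
The psubb of 0x80808080 exists because pmaddubsw treats the pixel
operand as signed, so the rows are biased into signed range while the
(256 - f, f) weights sit in the unsigned operand; the paddw of 0x8080
afterwards restores the bias and adds the rounding term before the
shift. The general path reduces to this scalar form (illustrative):

    #include <stddef.h>
    #include <stdint.h>

    static void InterpolateRow_Sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                      ptrdiff_t src_stride, int dst_width,
                                      int source_y_fraction) {
      const uint8_t* src_ptr1 = src_ptr + src_stride;
      int i;
      for (i = 0; i < dst_width; ++i) {
        dst_ptr[i] = (uint8_t)((src_ptr[i] * (256 - source_y_fraction) +
                                src_ptr1[i] * source_y_fraction + 128) >> 8);
      }
    }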
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 32x2 -> 32x1
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
+void InterpolateRow_AVX2(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
                          int source_y_fraction) {
-  asm volatile (
-    "cmp       $0x0,%3                         \n"
-    "je        100f                            \n"
-    "sub       %1,%0                           \n"
-    "cmp       $0x80,%3                        \n"
-    "je        50f                             \n"
+  asm volatile(
+      "cmp       $0x0,%3                         \n"
+      "je        100f                            \n"
+      "sub       %1,%0                           \n"
+      "cmp       $0x80,%3                        \n"
+      "je        50f                             \n"
 
-    "vmovd      %3,%%xmm0                      \n"
-    "neg        %3                             \n"
-    "add        $0x100,%3                      \n"
-    "vmovd      %3,%%xmm5                      \n"
-    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
-    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
-    "vbroadcastss %%xmm5,%%ymm5                \n"
-    "mov        $0x80808080,%%eax              \n"
-    "vmovd      %%eax,%%xmm4                   \n"
-    "vbroadcastss %%xmm4,%%ymm4                \n"
+      "vmovd      %3,%%xmm0                      \n"
+      "neg        %3                             \n"
+      "add        $0x100,%3                      \n"
+      "vmovd      %3,%%xmm5                      \n"
+      "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
+      "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
+      "vbroadcastss %%xmm5,%%ymm5                \n"
+      "mov        $0x80808080,%%eax              \n"
+      "vmovd      %%eax,%%xmm4                   \n"
+      "vbroadcastss %%xmm4,%%ymm4                \n"
 
-    // General purpose row blend.
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
-    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
-    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
-    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
-    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
-    "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
+      // General purpose row blend.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%1),%%ymm0                    \n"
+      "vmovdqu    0x00(%1,%4,1),%%ymm2           \n"
+      "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
+      "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
+      "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
+      "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vmovdqu    %%ymm0,0x00(%1,%0,1)           \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "jmp        99f                            \n"
 
-    // Blend 50 / 50.
-    LABELALIGN
-  "50:                                         \n"
-    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
-    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
-    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        50b                             \n"
-    "jmp       99f                             \n"
+      // Blend 50 / 50.
+      LABELALIGN
+      "50:                                       \n"
+      "vmovdqu   (%1),%%ymm0                     \n"
+      "vpavgb    0x00(%1,%4,1),%%ymm0,%%ymm0     \n"
+      "vmovdqu   %%ymm0,0x00(%1,%0,1)            \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        50b                             \n"
+      "jmp       99f                             \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-    LABELALIGN
-  "100:                                        \n"
-    "rep movsb " MEMMOVESTRING(1,0) "          \n"
-    "jmp       999f                            \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      LABELALIGN
+      "100:                                      \n"
+      "rep movsb                                 \n"
+      "jmp       999f                            \n"
 
-  "99:                                         \n"
-    "vzeroupper                                \n"
-  "999:                                        \n"
-  : "+D"(dst_ptr),    // %0
-    "+S"(src_ptr),    // %1
-    "+cm"(dst_width),  // %2
-    "+r"(source_y_fraction)  // %3
-  : "r"((intptr_t)(src_stride))  // %4
-  : "memory", "cc", "eax", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
-  );
+      "99:                                       \n"
+      "vzeroupper                                \n"
+      "999:                                      \n"
+      : "+D"(dst_ptr),               // %0
+        "+S"(src_ptr),               // %1
+        "+cm"(dst_width),            // %2
+        "+r"(source_y_fraction)      // %3
+      : "r"((intptr_t)(src_stride))  // %4
+      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5");
 }
 #endif  // HAS_INTERPOLATEROW_AVX2
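The AVX2 version replaces the copy path with a plain rep movsb, which
is why its operands are pinned rather than left as generic "r"
constraints: "+D" puts dst_ptr in rdi, "+S" puts src_ptr in rsi, and
the count constraint covers rcx, the registers the string move
implicitly consumes. Note also that the "je 100f" test happens before
"sub %1,%0" here, so the copy path still sees a real destination
pointer; the SSSE3 version subtracts first because its stores go
through 0x00(%1,%0,1).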
 
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int width) {
-  asm volatile (
-    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pshufb    %%xmm5,%%xmm0                   \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          const uint8_t* shuffler,
+                          int width) {
+  asm volatile(
+
+      "movdqu    (%3),%%xmm5                     \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pshufb    %%xmm5,%%xmm0                   \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(shuffler)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_ARGBSHUFFLEROW_SSSE3
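
For reference, pshufb here is a byte permute within each 16-byte block: output byte i takes source byte shuffler[i] of the same block, and libyuv's channel-reorder masks repeat a 4-byte pattern four times. A hedged scalar model, assuming width is a multiple of 4 pixels and the shuffler entries have the high (zeroing) bit clear:

#include <stdint.h>

static void ARGBShuffleRowRef(const uint8_t* src_argb, uint8_t* dst_argb,
                              const uint8_t* shuffler, int width) {
  for (int i = 0; i < width * 4; ++i) {  // width is in 4-byte ARGB pixels
    int block = i & ~15;                 // start of the 16-byte block
    dst_argb[i] = src_argb[block + (shuffler[i & 15] & 15)];
  }
}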
 
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  asm volatile (
-    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
-    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
-    "lea       " MEMLEA(0x40,0) ",%0           \n"
-    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
-    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
-    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
-    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm5"
-  );
+void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  asm volatile(
+
+      "vbroadcastf128 (%3),%%ymm5                \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu   (%0),%%ymm0                     \n"
+      "vmovdqu   0x20(%0),%%ymm1                 \n"
+      "lea       0x40(%0),%0                     \n"
+      "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
+      "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
+      "vmovdqu   %%ymm0,(%1)                     \n"
+      "vmovdqu   %%ymm1,0x20(%1)                 \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(shuffler)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm5");
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
 
-#ifdef HAS_ARGBSHUFFLEROW_SSE2
-// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  uintptr_t pixel_temp;
-  asm volatile (
-    "pxor      %%xmm5,%%xmm5                   \n"
-    "mov       " MEMACCESS(4) ",%k2            \n"
-    "cmp       $0x3000102,%k2                  \n"
-    "je        3012f                           \n"
-    "cmp       $0x10203,%k2                    \n"
-    "je        123f                            \n"
-    "cmp       $0x30201,%k2                    \n"
-    "je        321f                            \n"
-    "cmp       $0x2010003,%k2                  \n"
-    "je        2103f                           \n"
+#ifdef HAS_I422TOYUY2ROW_SSE2
+void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
 
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(4) ",%2             \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS(1) "            \n"
-    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
-    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
-    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
-    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
-    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    "lea       " MEMLEA(0x4,1) ",%1            \n"
-    "sub       $0x1,%3                         \n"
-    "jg        1b                              \n"
-    "jmp       99f                             \n"
+      "sub       %1,%2                             \n"
 
-    LABELALIGN
-  "123:                                        \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        123b                            \n"
-    "jmp       99f                             \n"
+      LABELALIGN
+      "1:                                          \n"
+      "movq      (%1),%%xmm2                       \n"
+      "movq      0x00(%1,%2,1),%%xmm1              \n"
+      "add       $0x8,%1                           \n"
+      "punpcklbw %%xmm1,%%xmm2                     \n"
+      "movdqu    (%0),%%xmm0                       \n"
+      "add       $0x10,%0                          \n"
+      "movdqa    %%xmm0,%%xmm1                     \n"
+      "punpcklbw %%xmm2,%%xmm0                     \n"
+      "punpckhbw %%xmm2,%%xmm1                     \n"
+      "movdqu    %%xmm0,(%3)                       \n"
+      "movdqu    %%xmm1,0x10(%3)                   \n"
+      "lea       0x20(%3),%3                       \n"
+      "sub       $0x10,%4                          \n"
+      "jg         1b                               \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_I422TOYUY2ROW_SSE2
 
-    LABELALIGN
-  "321:                                        \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        321b                            \n"
-    "jmp       99f                             \n"
+#ifdef HAS_I422TOUYVYROW_SSE2
+void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
 
-    LABELALIGN
-  "2103:                                       \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        2103b                           \n"
-    "jmp       99f                             \n"
+      "sub        %1,%2                            \n"
 
-    LABELALIGN
-  "3012:                                       \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpckhbw %%xmm5,%%xmm1                   \n"
-    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
-    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
-    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
-    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        3012b                           \n"
-
-  "99:                                         \n"
-  : "+r"(src_argb),     // %0
-    "+r"(dst_argb),     // %1
-    "=&d"(pixel_temp),  // %2
-    "+r"(width)         // %3
-  : "r"(shuffler)       // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm5"
-  );
+      LABELALIGN
+      "1:                                          \n"
+      "movq      (%1),%%xmm2                       \n"
+      "movq      0x00(%1,%2,1),%%xmm1              \n"
+      "add       $0x8,%1                           \n"
+      "punpcklbw %%xmm1,%%xmm2                     \n"
+      "movdqu    (%0),%%xmm0                       \n"
+      "movdqa    %%xmm2,%%xmm1                     \n"
+      "add       $0x10,%0                          \n"
+      "punpcklbw %%xmm0,%%xmm1                     \n"
+      "punpckhbw %%xmm0,%%xmm2                     \n"
+      "movdqu    %%xmm1,(%3)                       \n"
+      "movdqu    %%xmm2,0x10(%3)                   \n"
+      "lea       0x20(%3),%3                       \n"
+      "sub       $0x10,%4                          \n"
+      "jg         1b                               \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
-#endif  // HAS_ARGBSHUFFLEROW_SSE2
+#endif  // HAS_I422TOUYVYROW_SSE2
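
Both functions above pack planar I422 into a 4:2:2 interleave; the only difference is byte order (YUY2 is Y0,U,Y1,V per pixel pair, UYVY is U,Y0,V,Y1). The "sub %1,%2" up front turns %2 into the constant u-to-v offset so one index register addresses both chroma planes. A scalar sketch of the YUY2 ordering, assuming even width:

#include <stdint.h>

static void I422ToYUY2RowRef(const uint8_t* src_y, const uint8_t* src_u,
                             const uint8_t* src_v, uint8_t* dst_yuy2,
                             int width) {
  for (int x = 0; x < width; x += 2) {  // one U/V sample per 2 Y samples
    dst_yuy2[0] = src_y[0];
    dst_yuy2[1] = src_u[0];
    dst_yuy2[2] = src_y[1];
    dst_yuy2[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst_yuy2 += 4;
  }
}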
 
-#ifdef HAS_I422TOYUY2ROW_SSE2
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
- asm volatile (
-    "sub       %1,%2                             \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movq      " MEMACCESS(1) ",%%xmm2           \n"
-    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
-    "lea       " MEMLEA(0x8,1) ",%1              \n"
-    "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "movdqa    %%xmm0,%%xmm1                     \n"
-    "punpcklbw %%xmm2,%%xmm0                     \n"
-    "punpckhbw %%xmm2,%%xmm1                     \n"
-    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
-    "lea       " MEMLEA(0x20,3) ",%3             \n"
-    "sub       $0x10,%4                          \n"
-    "jg         1b                               \n"
-    : "+r"(src_y),  // %0
-      "+r"(src_u),  // %1
-      "+r"(src_v),  // %2
-      "+r"(dst_frame),  // %3
-      "+rm"(width)  // %4
-    :
-    : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+#ifdef HAS_I422TOYUY2ROW_AVX2
+void I422ToYUY2Row_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
+
+      "sub       %1,%2                             \n"
+
+      LABELALIGN
+      "1:                                          \n"
+      "vpmovzxbw  (%1),%%ymm1                      \n"
+      "vpmovzxbw  0x00(%1,%2,1),%%ymm2             \n"
+      "add        $0x10,%1                         \n"
+      "vpsllw     $0x8,%%ymm2,%%ymm2               \n"
+      "vpor       %%ymm1,%%ymm2,%%ymm2             \n"
+      "vmovdqu    (%0),%%ymm0                      \n"
+      "add        $0x20,%0                         \n"
+      "vpunpcklbw %%ymm2,%%ymm0,%%ymm1             \n"
+      "vpunpckhbw %%ymm2,%%ymm0,%%ymm2             \n"
+      "vextractf128 $0x0,%%ymm1,(%3)               \n"
+      "vextractf128 $0x0,%%ymm2,0x10(%3)           \n"
+      "vextractf128 $0x1,%%ymm1,0x20(%3)           \n"
+      "vextractf128 $0x1,%%ymm2,0x30(%3)           \n"
+      "lea        0x40(%3),%3                      \n"
+      "sub        $0x20,%4                         \n"
+      "jg         1b                               \n"
+      "vzeroupper                                  \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
-#endif  // HAS_I422TOYUY2ROW_SSE2
+#endif  // HAS_I422TOYUY2ROW_AVX2
 
-#ifdef HAS_I422TOUYVYROW_SSE2
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
- asm volatile (
-    "sub        %1,%2                            \n"
-    LABELALIGN
-  "1:                                            \n"
-    "movq      " MEMACCESS(1) ",%%xmm2           \n"
-    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
-    "lea       " MEMLEA(0x8,1) ",%1              \n"
-    "punpcklbw %%xmm3,%%xmm2                     \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
-    "movdqa    %%xmm2,%%xmm1                     \n"
-    "lea       " MEMLEA(0x10,0) ",%0             \n"
-    "punpcklbw %%xmm0,%%xmm1                     \n"
-    "punpckhbw %%xmm0,%%xmm2                     \n"
-    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
-    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
-    "lea       " MEMLEA(0x20,3) ",%3             \n"
-    "sub       $0x10,%4                          \n"
-    "jg         1b                               \n"
-    : "+r"(src_y),  // %0
-      "+r"(src_u),  // %1
-      "+r"(src_v),  // %2
-      "+r"(dst_frame),  // %3
-      "+rm"(width)  // %4
-    :
-    : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+#ifdef HAS_I422TOUYVYROW_AVX2
+void I422ToUYVYRow_AVX2(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
+
+      "sub        %1,%2                            \n"
+
+      LABELALIGN
+      "1:                                          \n"
+      "vpmovzxbw  (%1),%%ymm1                      \n"
+      "vpmovzxbw  0x00(%1,%2,1),%%ymm2             \n"
+      "add        $0x10,%1                         \n"
+      "vpsllw     $0x8,%%ymm2,%%ymm2               \n"
+      "vpor       %%ymm1,%%ymm2,%%ymm2             \n"
+      "vmovdqu    (%0),%%ymm0                      \n"
+      "add        $0x20,%0                         \n"
+      "vpunpcklbw %%ymm0,%%ymm2,%%ymm1             \n"
+      "vpunpckhbw %%ymm0,%%ymm2,%%ymm2             \n"
+      "vextractf128 $0x0,%%ymm1,(%3)               \n"
+      "vextractf128 $0x0,%%ymm2,0x10(%3)           \n"
+      "vextractf128 $0x1,%%ymm1,0x20(%3)           \n"
+      "vextractf128 $0x1,%%ymm2,0x30(%3)           \n"
+      "lea        0x40(%3),%3                      \n"
+      "sub        $0x20,%4                         \n"
+      "jg         1b                               \n"
+      "vzeroupper                                  \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+rm"(width)     // %4
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
-#endif  // HAS_I422TOUYVYROW_SSE2
+#endif  // HAS_I422TOUYVYROW_AVX2
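
The AVX2 pair interleaves U and V without punpck: each byte is widened to 16 bits, V is shifted left 8, and the two are ORed. Per 16-bit lane that vpmovzxbw/vpsllw/vpor sequence is just:

#include <stdint.h>

static uint16_t InterleaveUV(uint8_t u, uint8_t v) {
  return (uint16_t)((uint16_t)u | ((uint16_t)v << 8));  // low byte U, high byte V
}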
 
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
+void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
                             int width) {
-  asm volatile (
-    "pxor      %%xmm3,%%xmm3                   \n"
+  asm volatile(
 
-    // 2 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "punpcklbw %%xmm3,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm4                   \n"
-    "punpcklwd %%xmm3,%%xmm0                   \n"
-    "punpckhwd %%xmm3,%%xmm4                   \n"
-    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
-    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "movdqa    %%xmm4,%%xmm5                   \n"
-    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
-    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
-    "addps     " MEMACCESS(3) ",%%xmm0         \n"
-    "addps     " MEMACCESS(3) ",%%xmm4         \n"
-    "movdqa    %%xmm1,%%xmm2                   \n"
-    "movdqa    %%xmm5,%%xmm6                   \n"
-    "mulps     %%xmm1,%%xmm2                   \n"
-    "mulps     %%xmm5,%%xmm6                   \n"
-    "mulps     %%xmm2,%%xmm1                   \n"
-    "mulps     %%xmm6,%%xmm5                   \n"
-    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
-    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
-    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
-    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
-    "addps     %%xmm2,%%xmm0                   \n"
-    "addps     %%xmm6,%%xmm4                   \n"
-    "addps     %%xmm1,%%xmm0                   \n"
-    "addps     %%xmm5,%%xmm4                   \n"
-    "cvttps2dq %%xmm0,%%xmm0                   \n"
-    "cvttps2dq %%xmm4,%%xmm4                   \n"
-    "packuswb  %%xmm4,%%xmm0                   \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x2,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(poly)        // %3
-  : "memory", "cc"
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+      "pxor      %%xmm3,%%xmm3                   \n"
+
+      // 2 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm0                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "punpcklbw %%xmm3,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm4                   \n"
+      "punpcklwd %%xmm3,%%xmm0                   \n"
+      "punpckhwd %%xmm3,%%xmm4                   \n"
+      "cvtdq2ps  %%xmm0,%%xmm0                   \n"
+      "cvtdq2ps  %%xmm4,%%xmm4                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "movdqa    %%xmm4,%%xmm5                   \n"
+      "mulps     0x10(%3),%%xmm0                 \n"
+      "mulps     0x10(%3),%%xmm4                 \n"
+      "addps     (%3),%%xmm0                     \n"
+      "addps     (%3),%%xmm4                     \n"
+      "movdqa    %%xmm1,%%xmm2                   \n"
+      "movdqa    %%xmm5,%%xmm6                   \n"
+      "mulps     %%xmm1,%%xmm2                   \n"
+      "mulps     %%xmm5,%%xmm6                   \n"
+      "mulps     %%xmm2,%%xmm1                   \n"
+      "mulps     %%xmm6,%%xmm5                   \n"
+      "mulps     0x20(%3),%%xmm2                 \n"
+      "mulps     0x20(%3),%%xmm6                 \n"
+      "mulps     0x30(%3),%%xmm1                 \n"
+      "mulps     0x30(%3),%%xmm5                 \n"
+      "addps     %%xmm2,%%xmm0                   \n"
+      "addps     %%xmm6,%%xmm4                   \n"
+      "addps     %%xmm1,%%xmm0                   \n"
+      "addps     %%xmm5,%%xmm4                   \n"
+      "cvttps2dq %%xmm0,%%xmm0                   \n"
+      "cvttps2dq %%xmm4,%%xmm4                   \n"
+      "packuswb  %%xmm4,%%xmm0                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x2,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(poly)        // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
 
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
+void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const float* poly,
                             int width) {
-  asm volatile (
-    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
-    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
-    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
-    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
+  asm volatile(
+      "vbroadcastf128 (%3),%%ymm4                \n"
+      "vbroadcastf128 0x10(%3),%%ymm5            \n"
+      "vbroadcastf128 0x20(%3),%%ymm6            \n"
+      "vbroadcastf128 0x30(%3),%%ymm7            \n"
 
-    // 2 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
-    "lea         " MEMLEA(0x8,0) ",%0          \n"
-    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
-    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
-    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
-    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
-    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
-    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
-    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
-    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
-    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
-    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
-    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
-    "lea         " MEMLEA(0x8,1) ",%1          \n"
-    "sub         $0x2,%2                       \n"
-    "jg          1b                            \n"
-    "vzeroupper                                \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  : "r"(poly)        // %3
-  : "memory", "cc",
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      // 2 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxbd   (%0),%%ymm0                   \n"  // 2 ARGB pixels
+      "lea         0x8(%0),%0                    \n"
+      "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
+      "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
+      "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
+      "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
+      "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
+      "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X *
+                                                      // X
+      "vcvttps2dq  %%ymm0,%%ymm0                 \n"
+      "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
+      "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
+      "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
+      "vmovq       %%xmm0,(%1)                   \n"
+      "lea         0x8(%1),%1                    \n"
+      "sub         $0x2,%2                       \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(poly)        // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
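
Both polynomial rows evaluate a per-channel cubic: poly holds four 4-float coefficient vectors (C0..C3, one float per channel) at offsets 0x00/0x10/0x20/0x30. A scalar model, with the clamping the pack instructions provide written out explicitly:

#include <stdint.h>

static void ARGBPolynomialRowRef(const uint8_t* src_argb, uint8_t* dst_argb,
                                 const float* poly, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int c = i & 3;  // channel selects the coefficient lane
    float x = (float)src_argb[i];
    float v = poly[c] + poly[4 + c] * x + poly[8 + c] * x * x +
              poly[12 + c] * x * x * x;
    if (v < 0.f) v = 0.f;  // the pack instructions saturate; clamp by hand
    if (v > 255.f) v = 255.f;
    dst_argb[i] = (uint8_t)v;
  }
}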
 
+#ifdef HAS_HALFFLOATROW_SSE2
+// 2^-112: rebiases the float exponent (bias 127) to the half-float bias (15).
+static float kScaleBias = 1.9259299444e-34f;
+void HalfFloatRow_SSE2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  scale *= kScaleBias;
+  asm volatile(
+      "movd        %3,%%xmm4                     \n"
+      "pshufd      $0x0,%%xmm4,%%xmm4            \n"
+      "pxor        %%xmm5,%%xmm5                 \n"
+      "sub         %0,%1                         \n"
+
+      // 8 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu      (%0),%%xmm2                   \n"  // 8 shorts
+      "add         $0x10,%0                      \n"
+      "movdqa      %%xmm2,%%xmm3                 \n"
+      "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/1
+      "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
+      "punpckhwd   %%xmm5,%%xmm3                 \n"
+      "cvtdq2ps    %%xmm3,%%xmm3                 \n"
+      "mulps       %%xmm4,%%xmm2                 \n"
+      "mulps       %%xmm4,%%xmm3                 \n"
+      "psrld       $0xd,%%xmm2                   \n"
+      "psrld       $0xd,%%xmm3                   \n"
+      "packssdw    %%xmm3,%%xmm2                 \n"
+      "movdqu      %%xmm2,-0x10(%0,%1,1)         \n"
+      "sub         $0x8,%2                       \n"
+      "jg          1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "m"(scale)   // %3
+      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_HALFFLOATROW_SSE2
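
kScaleBias is 2^-112, so the multiply folds the caller's scale together with an exponent rebias: a float exponent stored with bias 127 comes out stored with bias 15, the half-float bias, and the psrld $0xd then drops the 13 extra mantissa bits. A hedged scalar model of the SSE2/AVX2 path (like the asm, it assumes the scaled value lands in half's normal range and ignores Inf/NaN/denormal handling):

#include <stdint.h>
#include <string.h>

static uint16_t ToHalfTrunc(uint16_t v, float scale) {
  float f = (float)v * (scale * 1.9259299444e-34f);  // scale * 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // float: 1 sign, 8 exp (bias 127), 23 mantissa
  return (uint16_t)(bits >> 13);    // half:  1 sign, 5 exp (bias 15), 10 mantissa
}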
+
+#ifdef HAS_HALFFLOATROW_AVX2
+void HalfFloatRow_AVX2(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  scale *= kScaleBias;
+  asm volatile(
+      "vbroadcastss  %3, %%ymm4                  \n"
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+      "sub        %0,%1                          \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm2                    \n"  // 16 shorts
+      "add        $0x20,%0                       \n"
+      "vpunpckhwd %%ymm5,%%ymm2,%%ymm3           \n"  // mutates
+      "vpunpcklwd %%ymm5,%%ymm2,%%ymm2           \n"
+      "vcvtdq2ps  %%ymm3,%%ymm3                  \n"
+      "vcvtdq2ps  %%ymm2,%%ymm2                  \n"
+      "vmulps     %%ymm3,%%ymm4,%%ymm3           \n"
+      "vmulps     %%ymm2,%%ymm4,%%ymm2           \n"
+      "vpsrld     $0xd,%%ymm3,%%ymm3             \n"
+      "vpsrld     $0xd,%%ymm2,%%ymm2             \n"
+      "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // unmutates
+      "vmovdqu    %%ymm2,-0x20(%0,%1,1)          \n"
+      "sub        $0x10,%2                       \n"
+      "jg         1b                             \n"
+
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+#if defined(__x86_64__)
+      : "x"(scale)  // %3
+#else
+      : "m"(scale)  // %3
+#endif
+      : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+#endif  // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloatRow_F16C(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  asm volatile(
+      "vbroadcastss  %3, %%ymm4                  \n"
+      "sub        %0,%1                          \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
+      "vpmovzxwd   0x10(%0),%%ymm3               \n"
+      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
+      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
+      "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
+      "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
+      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
+      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
+      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
+      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
+      "add         $0x20,%0                      \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+#if defined(__x86_64__)
+      : "x"(scale)  // %3
+#else
+      : "m"(scale)  // %3
+#endif
+      : "memory", "cc", "xmm2", "xmm3", "xmm4");
+}
+#endif  // HAS_HALFFLOATROW_F16C
+
+#ifdef HAS_HALFFLOATROW_F16C
+void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) {
+  asm volatile(
+      "sub        %0,%1                          \n"
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "vpmovzxwd   (%0),%%ymm2                   \n"  // 16 shorts -> 16 ints
+      "vpmovzxwd   0x10(%0),%%ymm3               \n"
+      "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
+      "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
+      "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
+      "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
+      "vmovdqu     %%xmm2,0x00(%0,%1,1)          \n"
+      "vmovdqu     %%xmm3,0x10(%0,%1,1)          \n"
+      "add         $0x20,%0                      \n"
+      "sub         $0x10,%2                      \n"
+      "jg          1b                            \n"
+      "vzeroupper                                \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "memory", "cc", "xmm2", "xmm3");
+}
+#endif  // HAS_HALFFLOATROW_F16C
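
With F16C the conversion collapses to one instruction per 8 floats; rounding immediate 3 selects truncation, matching the "$3" in the asm above. In intrinsic form (needs -mf16c or equivalent) the core step would look roughly like:

#include <immintrin.h>

static __m128i Cvt8FloatsToHalf(__m256 v) {
  return _mm256_cvtps_ph(v, 3);  // 8 floats -> 8 IEEE halves, round toward zero
}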
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Transform ARGB pixels with color table.
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
+void ARGBColorTableRow_X86(uint8_t* dst_argb,
+                           const uint8_t* table_argb,
                            int width) {
   uintptr_t pixel_temp;
-  asm volatile (
-    // 1 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(0) ",%1             \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
-    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
-    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
-    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
-    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
-    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
-    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
-    "dec       %2                              \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),     // %0
-    "=&d"(pixel_temp),  // %1
-    "+r"(width)         // %2
-  : "r"(table_argb)     // %3
-  : "memory", "cc");
+  asm volatile(
+      // 1 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movzb     (%0),%1                         \n"
+      "lea       0x4(%0),%0                      \n"
+      "movzb     0x00(%3,%1,4),%1                \n"
+      "mov       %b1,-0x4(%0)                    \n"
+      "movzb     -0x3(%0),%1                     \n"
+      "movzb     0x01(%3,%1,4),%1                \n"
+      "mov       %b1,-0x3(%0)                    \n"
+      "movzb     -0x2(%0),%1                     \n"
+      "movzb     0x02(%3,%1,4),%1                \n"
+      "mov       %b1,-0x2(%0)                    \n"
+      "movzb     -0x1(%0),%1                     \n"
+      "movzb     0x03(%3,%1,4),%1                \n"
+      "mov       %b1,-0x1(%0)                    \n"
+      "dec       %2                              \n"
+      "jg        1b                              \n"
+      : "+r"(dst_argb),     // %0
+        "=&d"(pixel_temp),  // %1
+        "+r"(width)         // %2
+      : "r"(table_argb)     // %3
+      : "memory", "cc");
 }
 #endif  // HAS_ARGBCOLORTABLEROW_X86
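
The loop above is a straight per-byte gather: channel n of each pixel indexes an interleaved 256 x 4 table at byte offset n. A scalar equivalent (the RGB variant below does the same minus the alpha lookup):

#include <stdint.h>

static void ARGBColorTableRowRef(uint8_t* dst_argb, const uint8_t* table_argb,
                                 int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}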
 
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Transform RGB pixels with color table.
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+void RGBColorTableRow_X86(uint8_t* dst_argb,
+                          const uint8_t* table_argb,
+                          int width) {
   uintptr_t pixel_temp;
-  asm volatile (
-    // 1 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movzb     " MEMACCESS(0) ",%1             \n"
-    "lea       " MEMLEA(0x4,0) ",%0            \n"
-    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
-    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
-    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
-    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
-    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
-    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
-    "dec       %2                              \n"
-    "jg        1b                              \n"
-  : "+r"(dst_argb),     // %0
-    "=&d"(pixel_temp),  // %1
-    "+r"(width)         // %2
-  : "r"(table_argb)     // %3
-  : "memory", "cc");
+  asm volatile(
+      // 1 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movzb     (%0),%1                         \n"
+      "lea       0x4(%0),%0                      \n"
+      "movzb     0x00(%3,%1,4),%1                \n"
+      "mov       %b1,-0x4(%0)                    \n"
+      "movzb     -0x3(%0),%1                     \n"
+      "movzb     0x01(%3,%1,4),%1                \n"
+      "mov       %b1,-0x3(%0)                    \n"
+      "movzb     -0x2(%0),%1                     \n"
+      "movzb     0x02(%3,%1,4),%1                \n"
+      "mov       %b1,-0x2(%0)                    \n"
+      "dec       %2                              \n"
+      "jg        1b                              \n"
+      : "+r"(dst_argb),     // %0
+        "=&d"(pixel_temp),  // %1
+        "+r"(width)         // %2
+      : "r"(table_argb)     // %3
+      : "memory", "cc");
 }
 #endif  // HAS_RGBCOLORTABLEROW_X86
 
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Transform RGB pixels with luma table.
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
+void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+                                 uint8_t* dst_argb,
                                  int width,
-                                 const uint8* luma, uint32 lumacoeff) {
+                                 const uint8_t* luma,
+                                 uint32_t lumacoeff) {
   uintptr_t pixel_temp;
   uintptr_t table_temp;
-  asm volatile (
-    "movd      %6,%%xmm3                       \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pcmpeqb   %%xmm4,%%xmm4                   \n"
-    "psllw     $0x8,%%xmm4                     \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
+  asm volatile(
+      "movd      %6,%%xmm3                       \n"
+      "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+      "pcmpeqb   %%xmm4,%%xmm4                   \n"
+      "psllw     $0x8,%%xmm4                     \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
 
-    // 4 pixel loop.
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
-    "pmaddubsw %%xmm3,%%xmm0                   \n"
-    "phaddw    %%xmm0,%%xmm0                   \n"
-    "pand      %%xmm4,%%xmm0                   \n"
-    "punpcklwd %%xmm5,%%xmm0                   \n"
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      // 4 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%2),%%xmm0                     \n"
+      "pmaddubsw %%xmm3,%%xmm0                   \n"
+      "phaddw    %%xmm0,%%xmm0                   \n"
+      "pand      %%xmm4,%%xmm0                   \n"
+      "punpcklwd %%xmm5,%%xmm0                   \n"
+      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+      "add       %5,%1                           \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
 
-    "movzb     " MEMACCESS(2) ",%0             \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS(3) "            \n"
-    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
-    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
-    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0x3,3) "       \n"
+      "movzb     (%2),%0                         \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,(%3)                        \n"
+      "movzb     0x1(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x1(%3)                     \n"
+      "movzb     0x2(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x2(%3)                     \n"
+      "movzb     0x3(%2),%0                      \n"
+      "mov       %b0,0x3(%3)                     \n"
 
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+      "add       %5,%1                           \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
 
-    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
-    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
-    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
-    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0x7,3) "       \n"
+      "movzb     0x4(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x4(%3)                     \n"
+      "movzb     0x5(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x5(%3)                     \n"
+      "movzb     0x6(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x6(%3)                     \n"
+      "movzb     0x7(%2),%0                      \n"
+      "mov       %b0,0x7(%3)                     \n"
 
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
-    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
+      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+      "add       %5,%1                           \n"
+      "pshufd    $0x39,%%xmm0,%%xmm0             \n"
 
-    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
-    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
-    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
-    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0xb,3) "       \n"
+      "movzb     0x8(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x8(%3)                     \n"
+      "movzb     0x9(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0x9(%3)                     \n"
+      "movzb     0xa(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0xa(%3)                     \n"
+      "movzb     0xb(%2),%0                      \n"
+      "mov       %b0,0xb(%3)                     \n"
 
-    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
-    "add       %5,%1                           \n"
+      "movd      %%xmm0,%k1                      \n"  // 32 bit offset
+      "add       %5,%1                           \n"
 
-    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
-    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
-    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
-    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
-    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
-    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
-    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "lea       " MEMLEA(0x10,3) ",%3           \n"
-    "sub       $0x4,%4                         \n"
-    "jg        1b                              \n"
-  : "=&d"(pixel_temp),  // %0
-    "=&a"(table_temp),  // %1
-    "+r"(src_argb),     // %2
-    "+r"(dst_argb),     // %3
-    "+rm"(width)        // %4
-  : "r"(luma),          // %5
-    "rm"(lumacoeff)     // %6
-  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
-  );
+      "movzb     0xc(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0xc(%3)                     \n"
+      "movzb     0xd(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0xd(%3)                     \n"
+      "movzb     0xe(%2),%0                      \n"
+      "movzb     0x00(%1,%0,1),%0                \n"
+      "mov       %b0,0xe(%3)                     \n"
+      "movzb     0xf(%2),%0                      \n"
+      "mov       %b0,0xf(%3)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "lea       0x10(%3),%3                     \n"
+      "sub       $0x4,%4                         \n"
+      "jg        1b                              \n"
+      : "=&d"(pixel_temp),  // %0
+        "=&a"(table_temp),  // %1
+        "+r"(src_argb),     // %2
+        "+r"(dst_argb),     // %3
+        "+rm"(width)        // %4
+      : "r"(luma),          // %5
+        "rm"(lumacoeff)     // %6
+      : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
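
Unpacking the SSSE3 loop above: pmaddubsw/phaddw form a weighted sum of the four channel bytes against the lumacoeff bytes, the pand with 0xFF00 words keeps the top byte of that sum as a table-row offset, and B/G/R are then remapped through the selected 256-byte row of luma while alpha is copied through. A hedged scalar reading (this helper takes the four coefficient bytes directly and ignores pmaddubsw's signed saturation):

#include <stdint.h>

static void ARGBLumaColorTableRowRef(const uint8_t* src_argb,
                                     uint8_t* dst_argb, int width,
                                     const uint8_t* luma,
                                     const uint8_t coeff[4]) {
  for (int x = 0; x < width; ++x) {
    int l = (src_argb[0] * coeff[0] + src_argb[1] * coeff[1] +
             src_argb[2] * coeff[2] + src_argb[3] * coeff[3]) & 0xFF00;
    const uint8_t* table = luma + l;  // one 256-entry remap row per luma bucket
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];        // alpha passes through untouched
    src_argb += 4;
    dst_argb += 4;
  }
}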
 
--- a/third_party/libyuv/source/row_mips.cc
+++ /dev/null
@@ -1,782 +1,0 @@
-/*
- *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-#ifdef HAS_COPYROW_MIPS
-void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
-  __asm__ __volatile__ (
-    ".set      noreorder                         \n"
-    ".set      noat                              \n"
-    "slti      $at, %[count], 8                  \n"
-    "bne       $at ,$zero, $last8                \n"
-    "xor       $t8, %[src], %[dst]               \n"
-    "andi      $t8, $t8, 0x3                     \n"
-
-    "bne       $t8, $zero, unaligned             \n"
-    "negu      $a3, %[dst]                       \n"
-    // make dst/src aligned
-    "andi      $a3, $a3, 0x3                     \n"
-    "beq       $a3, $zero, $chk16w               \n"
-    // word-aligned; now count is the remaining byte count
-    "subu     %[count], %[count], $a3            \n"
-
-    "lwr       $t8, 0(%[src])                    \n"
-    "addu      %[src], %[src], $a3               \n"
-    "swr       $t8, 0(%[dst])                    \n"
-    "addu      %[dst], %[dst], $a3               \n"
-
-    // Now the dst/src are mutually word-aligned with word-aligned addresses
-    "$chk16w:                                    \n"
-    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
-    // t8 is the byte count after 64-byte chunks
-    "beq       %[count], $t8, chk8w              \n"
-    // There will be at most 1 32-byte chunk after it
-    "subu      $a3, %[count], $t8                \n"  // the reminder
-    // Here a3 counts bytes in 16w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // Now a3 is the final dst after 64-byte chunks
-    "addu      $t0, %[dst], %[count]             \n"
-    // t0 is the "past the end" address
-
-    // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past
-    // the "t0-32" address
-    // This means: for x=128 the last "safe" a1 address is "t0-160"
-    // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
-    // we will use "pref 30,128(a1)", so "t0-160" is the limit
-    "subu      $t9, $t0, 160                     \n"
-    // t9 is the "last safe pref 30,128(a1)" address
-    "pref      0, 0(%[src])                      \n"  // first line of src
-    "pref      0, 32(%[src])                     \n"  // second line of src
-    "pref      0, 64(%[src])                     \n"
-    "pref      30, 32(%[dst])                    \n"
-    // In case the a1 > t9 don't use "pref 30" at all
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bgtz      $v1, $loop16w                     \n"
-    "nop                                         \n"
-    // otherwise, start with using pref30
-    "pref      30, 64(%[dst])                    \n"
-    "$loop16w:                                    \n"
-    "pref      0, 96(%[src])                     \n"
-    "lw        $t0, 0(%[src])                    \n"
-    "bgtz      $v1, $skip_pref30_96              \n"  // skip
-    "lw        $t1, 4(%[src])                    \n"
-    "pref      30, 96(%[dst])                    \n"  // continue
-    "$skip_pref30_96:                            \n"
-    "lw        $t2, 8(%[src])                    \n"
-    "lw        $t3, 12(%[src])                   \n"
-    "lw        $t4, 16(%[src])                   \n"
-    "lw        $t5, 20(%[src])                   \n"
-    "lw        $t6, 24(%[src])                   \n"
-    "lw        $t7, 28(%[src])                   \n"
-    "pref      0, 128(%[src])                    \n"
-    //  bring the next lines of src, addr 128
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "lw        $t0, 32(%[src])                   \n"
-    "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(a1)
-    "lw        $t1, 36(%[src])                   \n"
-    "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
-    "$skip_pref30_128:                           \n"
-    "lw        $t2, 40(%[src])                   \n"
-    "lw        $t3, 44(%[src])                   \n"
-    "lw        $t4, 48(%[src])                   \n"
-    "lw        $t5, 52(%[src])                   \n"
-    "lw        $t6, 56(%[src])                   \n"
-    "lw        $t7, 60(%[src])                   \n"
-    "pref      0, 160(%[src])                    \n"
-    // bring the next lines of src, addr 160
-    "sw        $t0, 32(%[dst])                   \n"
-    "sw        $t1, 36(%[dst])                   \n"
-    "sw        $t2, 40(%[dst])                   \n"
-    "sw        $t3, 44(%[dst])                   \n"
-    "sw        $t4, 48(%[dst])                   \n"
-    "sw        $t5, 52(%[dst])                   \n"
-    "sw        $t6, 56(%[dst])                   \n"
-    "sw        $t7, 60(%[dst])                   \n"
-
-    "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bne       %[dst], $a3, $loop16w             \n"
-    " addiu    %[src], %[src], 64                \n"  // adding 64 to src
-    "move      %[count], $t8                     \n"
-
-    // Here we have src and dest word-aligned but less than 64-bytes to go
-
-    "chk8w:                                      \n"
-    "pref      0, 0x0(%[src])                    \n"
-    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
-    // t8 is the remainder count past 32 bytes
-    "beq       %[count], $t8, chk1w              \n"
-    // count=t8,no 32-byte chunk
-    " nop                                        \n"
-
-    "lw        $t0, 0(%[src])                    \n"
-    "lw        $t1, 4(%[src])                    \n"
-    "lw        $t2, 8(%[src])                    \n"
-    "lw        $t3, 12(%[src])                   \n"
-    "lw        $t4, 16(%[src])                   \n"
-    "lw        $t5, 20(%[src])                   \n"
-    "lw        $t6, 24(%[src])                   \n"
-    "lw        $t7, 28(%[src])                   \n"
-    "addiu     %[src], %[src], 32                \n"
-
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "addiu     %[dst], %[dst], 32                \n"
-
-    "chk1w:                                      \n"
-    "andi      %[count], $t8, 0x3                \n"
-    // now count is the remainder past 1w chunks
-    "beq       %[count], $t8, $last8             \n"
-    " subu     $a3, $t8, %[count]                \n"
-    // a3 is count of bytes in 1w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // now a3 is the dst address past the 1w chunks
-    // copying in words (4-byte chunks)
-    "$wordCopy_loop:                             \n"
-    "lw        $t3, 0(%[src])                    \n"
-    // the first t3 may be equal t0 ... optimize?
-    "addiu     %[src], %[src],4                  \n"
-    "addiu     %[dst], %[dst],4                  \n"
-    "bne       %[dst], $a3,$wordCopy_loop        \n"
-    " sw       $t3, -4(%[dst])                   \n"
-
-    // For the last (<8) bytes
-    "$last8:                                     \n"
-    "blez      %[count], leave                   \n"
-    " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
-    "$last8loop:                                 \n"
-    "lb        $v1, 0(%[src])                    \n"
-    "addiu     %[src], %[src], 1                 \n"
-    "addiu     %[dst], %[dst], 1                 \n"
-    "bne       %[dst], $a3, $last8loop           \n"
-    " sb       $v1, -1(%[dst])                   \n"
-
-    "leave:                                      \n"
-    "  j       $ra                               \n"
-    "  nop                                       \n"
-
-    //
-    // UNALIGNED case
-    //
-
-    "unaligned:                                  \n"
-    // got here with a3="negu a1"
-    "andi      $a3, $a3, 0x3                     \n"  // a1 is word aligned?
-    "beqz      $a3, $ua_chk16w                   \n"
-    " subu     %[count], %[count], $a3           \n"
-    // bytes left after initial a3 bytes
-    "lwr       $v1, 0(%[src])                    \n"
-    "lwl       $v1, 3(%[src])                    \n"
-    "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
-    "swr       $v1, 0(%[dst])                    \n"
-    "addu      %[dst], %[dst], $a3               \n"
-    // below the dst will be word aligned (NOTE1)
-    "$ua_chk16w:                                 \n"
-    "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
-    // t8 is the byte count after 64-byte chunks
-    "beq       %[count], $t8, ua_chk8w           \n"
-    // if a2==t8, no 64-byte chunks
-    // There will be at most 1 32-byte chunk after it
-    "subu      $a3, %[count], $t8                \n"  // the reminder
-    // Here a3 counts bytes in 16w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // Now a3 is the final dst after 64-byte chunks
-    "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
-    "subu      $t9, $t0, 160                     \n"
-    // t9 is the "last safe pref 30,128(a1)" address
-    "pref      0, 0(%[src])                      \n"  // first line of src
-    "pref      0, 32(%[src])                     \n"  // second line  addr 32
-    "pref      0, 64(%[src])                     \n"
-    "pref      30, 32(%[dst])                    \n"
-    // safe, as we have at least 64 bytes ahead
-    // In case the a1 > t9 don't use "pref 30" at all
-    "sgtu      $v1, %[dst], $t9                  \n"
-    "bgtz      $v1, $ua_loop16w                  \n"
-    // skip "pref 30,64(a1)" for too short arrays
-    " nop                                        \n"
-    // otherwise, start with using pref30
-    "pref      30, 64(%[dst])                    \n"
-    "$ua_loop16w:                                \n"
-    "pref      0, 96(%[src])                     \n"
-    "lwr       $t0, 0(%[src])                    \n"
-    "lwl       $t0, 3(%[src])                    \n"
-    "lwr       $t1, 4(%[src])                    \n"
-    "bgtz      $v1, $ua_skip_pref30_96           \n"
-    " lwl      $t1, 7(%[src])                    \n"
-    "pref      30, 96(%[dst])                    \n"
-    // continue setting up the dest, addr 96
-    "$ua_skip_pref30_96:                         \n"
-    "lwr       $t2, 8(%[src])                    \n"
-    "lwl       $t2, 11(%[src])                   \n"
-    "lwr       $t3, 12(%[src])                   \n"
-    "lwl       $t3, 15(%[src])                   \n"
-    "lwr       $t4, 16(%[src])                   \n"
-    "lwl       $t4, 19(%[src])                   \n"
-    "lwr       $t5, 20(%[src])                   \n"
-    "lwl       $t5, 23(%[src])                   \n"
-    "lwr       $t6, 24(%[src])                   \n"
-    "lwl       $t6, 27(%[src])                   \n"
-    "lwr       $t7, 28(%[src])                   \n"
-    "lwl       $t7, 31(%[src])                   \n"
-    "pref      0, 128(%[src])                    \n"
-    // bring the next lines of src, addr 128
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "lwr       $t0, 32(%[src])                   \n"
-    "lwl       $t0, 35(%[src])                   \n"
-    "lwr       $t1, 36(%[src])                   \n"
-    "bgtz      $v1, ua_skip_pref30_128           \n"
-    " lwl      $t1, 39(%[src])                   \n"
-    "pref      30, 128(%[dst])                   \n"
-    // continue setting up the dest, addr 128
-    "ua_skip_pref30_128:                         \n"
-
-    "lwr       $t2, 40(%[src])                   \n"
-    "lwl       $t2, 43(%[src])                   \n"
-    "lwr       $t3, 44(%[src])                   \n"
-    "lwl       $t3, 47(%[src])                   \n"
-    "lwr       $t4, 48(%[src])                   \n"
-    "lwl       $t4, 51(%[src])                   \n"
-    "lwr       $t5, 52(%[src])                   \n"
-    "lwl       $t5, 55(%[src])                   \n"
-    "lwr       $t6, 56(%[src])                   \n"
-    "lwl       $t6, 59(%[src])                   \n"
-    "lwr       $t7, 60(%[src])                   \n"
-    "lwl       $t7, 63(%[src])                   \n"
-    "pref      0, 160(%[src])                    \n"
-    // bring the next lines of src, addr 160
-    "sw        $t0, 32(%[dst])                   \n"
-    "sw        $t1, 36(%[dst])                   \n"
-    "sw        $t2, 40(%[dst])                   \n"
-    "sw        $t3, 44(%[dst])                   \n"
-    "sw        $t4, 48(%[dst])                   \n"
-    "sw        $t5, 52(%[dst])                   \n"
-    "sw        $t6, 56(%[dst])                   \n"
-    "sw        $t7, 60(%[dst])                   \n"
-
-    "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
-    "sgtu      $v1,%[dst],$t9                    \n"
-    "bne       %[dst],$a3,$ua_loop16w            \n"
-    " addiu    %[src],%[src],64                  \n"  // adding 64 to src
-    "move      %[count],$t8                      \n"
-
-    // Here we have src and dest word-aligned but less than 64 bytes to go
-
-    "ua_chk8w:                                   \n"
-    "pref      0, 0x0(%[src])                    \n"
-    "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
-    // t8 is the remainder count
-    "beq       %[count], $t8, $ua_chk1w          \n"
-    // when count==t8, no 32-byte chunk
-
-    "lwr       $t0, 0(%[src])                    \n"
-    "lwl       $t0, 3(%[src])                    \n"
-    "lwr       $t1, 4(%[src])                    \n"
-    "lwl       $t1, 7(%[src])                    \n"
-    "lwr       $t2, 8(%[src])                    \n"
-    "lwl       $t2, 11(%[src])                   \n"
-    "lwr       $t3, 12(%[src])                   \n"
-    "lwl       $t3, 15(%[src])                   \n"
-    "lwr       $t4, 16(%[src])                   \n"
-    "lwl       $t4, 19(%[src])                   \n"
-    "lwr       $t5, 20(%[src])                   \n"
-    "lwl       $t5, 23(%[src])                   \n"
-    "lwr       $t6, 24(%[src])                   \n"
-    "lwl       $t6, 27(%[src])                   \n"
-    "lwr       $t7, 28(%[src])                   \n"
-    "lwl       $t7, 31(%[src])                   \n"
-    "addiu     %[src], %[src], 32                \n"
-
-    "sw        $t0, 0(%[dst])                    \n"
-    "sw        $t1, 4(%[dst])                    \n"
-    "sw        $t2, 8(%[dst])                    \n"
-    "sw        $t3, 12(%[dst])                   \n"
-    "sw        $t4, 16(%[dst])                   \n"
-    "sw        $t5, 20(%[dst])                   \n"
-    "sw        $t6, 24(%[dst])                   \n"
-    "sw        $t7, 28(%[dst])                   \n"
-    "addiu     %[dst], %[dst], 32                \n"
-
-    "$ua_chk1w:                                  \n"
-    "andi      %[count], $t8, 0x3                \n"
-    // now count is the remainder past 1w chunks
-    "beq       %[count], $t8, ua_smallCopy       \n"
-    "subu      $a3, $t8, %[count]                \n"
-    // a3 is count of bytes in 1w chunks
-    "addu      $a3, %[dst], $a3                  \n"
-    // now a3 is the dst address past the 1w chunks
-
-    // copying in words (4-byte chunks)
-    "$ua_wordCopy_loop:                          \n"
-    "lwr       $v1, 0(%[src])                    \n"
-    "lwl       $v1, 3(%[src])                    \n"
-    "addiu     %[src], %[src], 4                 \n"
-    "addiu     %[dst], %[dst], 4                 \n"
-    // note: dst=a1 is word aligned here, see NOTE1
-    "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
-    " sw       $v1,-4(%[dst])                    \n"
-
-    // Now less than 4 bytes (value in count) left to copy
-    "ua_smallCopy:                               \n"
-    "beqz      %[count], leave                   \n"
-    " addu     $a3, %[dst], %[count]             \n" // a3 = last dst address
-    "$ua_smallCopy_loop:                         \n"
-    "lb        $v1, 0(%[src])                    \n"
-    "addiu     %[src], %[src], 1                 \n"
-    "addiu     %[dst], %[dst], 1                 \n"
-    "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
-    " sb       $v1, -1(%[dst])                   \n"
-
-    "j         $ra                               \n"
-    " nop                                        \n"
-    ".set      at                                \n"
-    ".set      reorder                           \n"
-       : [dst] "+r" (dst), [src] "+r" (src)
-       : [count] "r" (count)
-       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7",
-       "t8", "t9", "a3", "v1", "at"
-  );
-}
-#endif  // HAS_COPYROW_MIPS
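
For reference, the unaligned copy paths being deleted here assemble each 32-bit word with an lwr/lwl pair and store it with sw to the word-aligned destination (see NOTE1). A portable C sketch of those inner loops; the function name is illustrative, not libyuv API:

#include <stdint.h>
#include <string.h>

/* Sketch of the deleted word-copy loops: memcpy stands in for the
 * lwr/lwl unaligned load; dst is word aligned in the original code. */
static void WordCopySketch(uint8_t* dst, const uint8_t* src, size_t count) {
  while (count >= 4) {
    uint32_t w;
    memcpy(&w, src, 4);  /* lwr $v1, 0(src); lwl $v1, 3(src) */
    memcpy(dst, &w, 4);  /* sw  $v1, -4(dst) after the pointer bump */
    src += 4;
    dst += 4;
    count -= 4;
  }
  while (count--) {  /* the $last8loop / $ua_smallCopy_loop byte tail */
    *dst++ = *src++;
  }
}
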
-
-// DSPR2 functions
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
-    (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6)
-
-void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                      int width) {
-  __asm__ __volatile__ (
-    ".set push                                     \n"
-    ".set noreorder                                \n"
-    "srl             $t4, %[width], 4              \n"  // multiplies of 16
-    "blez            $t4, 2f                       \n"
-    " andi           %[width], %[width], 0xf       \n"  // residual
-
-  "1:                                              \n"
-    "addiu           $t4, $t4, -1                  \n"
-    "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
-    "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
-    "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
-    "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
-    "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
-    "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 | U10
-    "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 | U12
-    "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 | U14
-    "addiu           %[src_uv], %[src_uv], 32      \n"
-    "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
-    "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
-    "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
-    "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
-    "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
-    "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
-    "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 | V12
-    "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 | U12
-    "sw              $t9, 0(%[dst_v])              \n"
-    "sw              $t0, 0(%[dst_u])              \n"
-    "sw              $t1, 4(%[dst_v])              \n"
-    "sw              $t2, 4(%[dst_u])              \n"
-    "sw              $t3, 8(%[dst_v])              \n"
-    "sw              $t5, 8(%[dst_u])              \n"
-    "sw              $t6, 12(%[dst_v])             \n"
-    "sw              $t7, 12(%[dst_u])             \n"
-    "addiu           %[dst_v], %[dst_v], 16        \n"
-    "bgtz            $t4, 1b                       \n"
-    " addiu          %[dst_u], %[dst_u], 16        \n"
-
-    "beqz            %[width], 3f                  \n"
-    " nop                                          \n"
-
-  "2:                                              \n"
-    "lbu             $t0, 0(%[src_uv])             \n"
-    "lbu             $t1, 1(%[src_uv])             \n"
-    "addiu           %[src_uv], %[src_uv], 2       \n"
-    "addiu           %[width], %[width], -1        \n"
-    "sb              $t0, 0(%[dst_u])              \n"
-    "sb              $t1, 0(%[dst_v])              \n"
-    "addiu           %[dst_u], %[dst_u], 1         \n"
-    "bgtz            %[width], 2b                  \n"
-    " addiu          %[dst_v], %[dst_v], 1         \n"
-
-  "3:                                              \n"
-    ".set pop                                      \n"
-     : [src_uv] "+r" (src_uv),
-       [width] "+r" (width),
-       [dst_u] "+r" (dst_u),
-       [dst_v] "+r" (dst_v)
-     :
-     : "t0", "t1", "t2", "t3",
-     "t4", "t5", "t6", "t7", "t8", "t9"
-  );
-}
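
The precrq.qb.ph/precr.qb.ph pairs above gather the odd (V) and even (U) bytes of four words at a time, 16 pixels per iteration. Scalar equivalent, with an illustrative name:

#include <stdint.h>

/* Scalar sketch of SplitUVRow: deinterleave packed UV into U and V planes. */
static void SplitUVRowSketch(const uint8_t* src_uv, uint8_t* dst_u,
                             uint8_t* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x];      /* even bytes are U */
    dst_v[x] = src_uv[2 * x + 1];  /* odd bytes are V */
  }
}
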
-
-void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
-  __asm__ __volatile__ (
-    ".set push                             \n"
-    ".set noreorder                        \n"
-
-    "srl       $t4, %[width], 4            \n"  // multiplies of 16
-    "andi      $t5, %[width], 0xf          \n"
-    "blez      $t4, 2f                     \n"
-    " addu     %[src], %[src], %[width]    \n"  // src += width
-
-   "1:                                     \n"
-    "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
-    "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
-    "lw        $t2, -8(%[src])             \n"  // |11|10|9|8|
-    "lw        $t3, -4(%[src])             \n"  // |15|14|13|12|
-    "wsbh      $t0, $t0                    \n"  // |2|3|0|1|
-    "wsbh      $t1, $t1                    \n"  // |6|7|4|5|
-    "wsbh      $t2, $t2                    \n"  // |10|11|8|9|
-    "wsbh      $t3, $t3                    \n"  // |14|15|12|13|
-    "rotr      $t0, $t0, 16                \n"  // |0|1|2|3|
-    "rotr      $t1, $t1, 16                \n"  // |4|5|6|7|
-    "rotr      $t2, $t2, 16                \n"  // |8|9|10|11|
-    "rotr      $t3, $t3, 16                \n"  // |12|13|14|15|
-    "addiu     %[src], %[src], -16         \n"
-    "addiu     $t4, $t4, -1                \n"
-    "sw        $t3, 0(%[dst])              \n"  // |15|14|13|12|
-    "sw        $t2, 4(%[dst])              \n"  // |11|10|9|8|
-    "sw        $t1, 8(%[dst])              \n"  // |7|6|5|4|
-    "sw        $t0, 12(%[dst])             \n"  // |3|2|1|0|
-    "bgtz      $t4, 1b                     \n"
-    " addiu    %[dst], %[dst], 16          \n"
-    "beqz      $t5, 3f                     \n"
-    " nop                                  \n"
-
-   "2:                                     \n"
-    "lbu       $t0, -1(%[src])             \n"
-    "addiu     $t5, $t5, -1                \n"
-    "addiu     %[src], %[src], -1          \n"
-    "sb        $t0, 0(%[dst])              \n"
-    "bgez      $t5, 2b                     \n"
-    " addiu    %[dst], %[dst], 1           \n"
-
-   "3:                                     \n"
-    ".set pop                              \n"
-      : [src] "+r" (src), [dst] "+r" (dst)
-      : [width] "r" (width)
-      : "t0", "t1", "t2", "t3", "t4", "t5"
-  );
-}
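
The wsbh/rotr pair reverses the four bytes of each word: wsbh swaps the bytes within each halfword, then a 16-bit rotate swaps the halfwords. A scalar sketch of that trick:

#include <stdint.h>

/* Reverse the bytes of one 32-bit word, mirroring wsbh + rotr 16. */
static uint32_t ReverseBytesSketch(uint32_t v) {
  v = ((v & 0x00ff00ffu) << 8) | ((v >> 8) & 0x00ff00ffu);  /* wsbh */
  return (v << 16) | (v >> 16);                             /* rotr 16 */
}
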
-
-void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                       int width) {
-  int x;
-  int y;
-  __asm__ __volatile__ (
-    ".set push                                    \n"
-    ".set noreorder                               \n"
-
-    "addu            $t4, %[width], %[width]      \n"
-    "srl             %[x], %[width], 4            \n"
-    "andi            %[y], %[width], 0xf          \n"
-    "blez            %[x], 2f                     \n"
-    " addu           %[src_uv], %[src_uv], $t4    \n"
-
-   "1:                                            \n"
-    "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
-    "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
-    "lw              $t2, -24(%[src_uv])          \n"  // |11|10|9|8|
-    "lw              $t3, -20(%[src_uv])          \n"  // |15|14|13|12|
-    "lw              $t4, -16(%[src_uv])          \n"  // |19|18|17|16|
-    "lw              $t6, -12(%[src_uv])          \n"  // |23|22|21|20|
-    "lw              $t7, -8(%[src_uv])           \n"  // |27|26|25|24|
-    "lw              $t8, -4(%[src_uv])           \n"  // |31|30|29|28|
-
-    "rotr            $t0, $t0, 16                 \n"  // |1|0|3|2|
-    "rotr            $t1, $t1, 16                 \n"  // |5|4|7|6|
-    "rotr            $t2, $t2, 16                 \n"  // |9|8|11|10|
-    "rotr            $t3, $t3, 16                 \n"  // |13|12|15|14|
-    "rotr            $t4, $t4, 16                 \n"  // |17|16|19|18|
-    "rotr            $t6, $t6, 16                 \n"  // |21|20|23|22|
-    "rotr            $t7, $t7, 16                 \n"  // |25|24|27|26|
-    "rotr            $t8, $t8, 16                 \n"  // |29|28|31|30|
-    "precr.qb.ph     $t9, $t0, $t1                \n"  // |0|2|4|6|
-    "precrq.qb.ph    $t5, $t0, $t1                \n"  // |1|3|5|7|
-    "precr.qb.ph     $t0, $t2, $t3                \n"  // |8|10|12|14|
-    "precrq.qb.ph    $t1, $t2, $t3                \n"  // |9|11|13|15|
-    "precr.qb.ph     $t2, $t4, $t6                \n"  // |16|18|20|22|
-    "precrq.qb.ph    $t3, $t4, $t6                \n"  // |17|19|21|23|
-    "precr.qb.ph     $t4, $t7, $t8                \n"  // |24|26|28|30|
-    "precrq.qb.ph    $t6, $t7, $t8                \n"  // |25|27|29|31|
-    "addiu           %[src_uv], %[src_uv], -32    \n"
-    "addiu           %[x], %[x], -1               \n"
-    "swr             $t4, 0(%[dst_u])             \n"
-    "swl             $t4, 3(%[dst_u])             \n"  // |30|28|26|24|
-    "swr             $t6, 0(%[dst_v])             \n"
-    "swl             $t6, 3(%[dst_v])             \n"  // |31|29|27|25|
-    "swr             $t2, 4(%[dst_u])             \n"
-    "swl             $t2, 7(%[dst_u])             \n"  // |22|20|18|16|
-    "swr             $t3, 4(%[dst_v])             \n"
-    "swl             $t3, 7(%[dst_v])             \n"  // |23|21|19|17|
-    "swr             $t0, 8(%[dst_u])             \n"
-    "swl             $t0, 11(%[dst_u])            \n"  // |14|12|10|8|
-    "swr             $t1, 8(%[dst_v])             \n"
-    "swl             $t1, 11(%[dst_v])            \n"  // |15|13|11|9|
-    "swr             $t9, 12(%[dst_u])            \n"
-    "swl             $t9, 15(%[dst_u])            \n"  // |6|4|2|0|
-    "swr             $t5, 12(%[dst_v])            \n"
-    "swl             $t5, 15(%[dst_v])            \n"  // |7|5|3|1|
-    "addiu           %[dst_v], %[dst_v], 16       \n"
-    "bgtz            %[x], 1b                     \n"
-    " addiu          %[dst_u], %[dst_u], 16       \n"
-    "beqz            %[y], 3f                     \n"
-    " nop                                         \n"
-    "b               2f                           \n"
-    " nop                                         \n"
-
-   "2:                                            \n"
-    "lbu             $t0, -2(%[src_uv])           \n"
-    "lbu             $t1, -1(%[src_uv])           \n"
-    "addiu           %[src_uv], %[src_uv], -2     \n"
-    "addiu           %[y], %[y], -1               \n"
-    "sb              $t0, 0(%[dst_u])             \n"
-    "sb              $t1, 0(%[dst_v])             \n"
-    "addiu           %[dst_u], %[dst_u], 1        \n"
-    "bgtz            %[y], 2b                     \n"
-    " addiu          %[dst_v], %[dst_v], 1        \n"
-
-   "3:                                            \n"
-    ".set pop                                     \n"
-      : [src_uv] "+r" (src_uv),
-        [dst_u] "+r" (dst_u),
-        [dst_v] "+r" (dst_v),
-        [x] "=&r" (x),
-        [y] "=&r" (y)
-      : [width] "r" (width)
-      : "t0", "t1", "t2", "t3", "t4",
-      "t5", "t7", "t8", "t9"
-  );
-}
-
-// Convert (4 Y and 2 VU) I422 and arrange RGB values into
-// t5 = | 0 | B0 | 0 | b0 |
-// t4 = | 0 | B1 | 0 | b1 |
-// t9 = | 0 | G0 | 0 | g0 |
-// t8 = | 0 | G1 | 0 | g1 |
-// t2 = | 0 | R0 | 0 | r0 |
-// t1 = | 0 | R1 | 0 | r1 |
-#define YUVTORGB                                                               \
-      "lw                $t0, 0(%[y_buf])       \n"                            \
-      "lhu               $t1, 0(%[u_buf])       \n"                            \
-      "lhu               $t2, 0(%[v_buf])       \n"                            \
-      "preceu.ph.qbr     $t1, $t1               \n"                            \
-      "preceu.ph.qbr     $t2, $t2               \n"                            \
-      "preceu.ph.qbra    $t3, $t0               \n"                            \
-      "preceu.ph.qbla    $t0, $t0               \n"                            \
-      "subu.ph           $t1, $t1, $s5          \n"                            \
-      "subu.ph           $t2, $t2, $s5          \n"                            \
-      "subu.ph           $t3, $t3, $s4          \n"                            \
-      "subu.ph           $t0, $t0, $s4          \n"                            \
-      "mul.ph            $t3, $t3, $s0          \n"                            \
-      "mul.ph            $t0, $t0, $s0          \n"                            \
-      "shll.ph           $t4, $t1, 0x7          \n"                            \
-      "subu.ph           $t4, $t4, $t1          \n"                            \
-      "mul.ph            $t6, $t1, $s1          \n"                            \
-      "mul.ph            $t1, $t2, $s2          \n"                            \
-      "addq_s.ph         $t5, $t4, $t3          \n"                            \
-      "addq_s.ph         $t4, $t4, $t0          \n"                            \
-      "shra.ph           $t5, $t5, 6            \n"                            \
-      "shra.ph           $t4, $t4, 6            \n"                            \
-      "addiu             %[u_buf], 2            \n"                            \
-      "addiu             %[v_buf], 2            \n"                            \
-      "addu.ph           $t6, $t6, $t1          \n"                            \
-      "mul.ph            $t1, $t2, $s3          \n"                            \
-      "addu.ph           $t9, $t6, $t3          \n"                            \
-      "addu.ph           $t8, $t6, $t0          \n"                            \
-      "shra.ph           $t9, $t9, 6            \n"                            \
-      "shra.ph           $t8, $t8, 6            \n"                            \
-      "addu.ph           $t2, $t1, $t3          \n"                            \
-      "addu.ph           $t1, $t1, $t0          \n"                            \
-      "shra.ph           $t2, $t2, 6            \n"                            \
-      "shra.ph           $t1, $t1, 6            \n"                            \
-      "subu.ph           $t5, $t5, $s5          \n"                            \
-      "subu.ph           $t4, $t4, $s5          \n"                            \
-      "subu.ph           $t9, $t9, $s5          \n"                            \
-      "subu.ph           $t8, $t8, $s5          \n"                            \
-      "subu.ph           $t2, $t2, $s5          \n"                            \
-      "subu.ph           $t1, $t1, $s5          \n"                            \
-      "shll_s.ph         $t5, $t5, 8            \n"                            \
-      "shll_s.ph         $t4, $t4, 8            \n"                            \
-      "shll_s.ph         $t9, $t9, 8            \n"                            \
-      "shll_s.ph         $t8, $t8, 8            \n"                            \
-      "shll_s.ph         $t2, $t2, 8            \n"                            \
-      "shll_s.ph         $t1, $t1, 8            \n"                            \
-      "shra.ph           $t5, $t5, 8            \n"                            \
-      "shra.ph           $t4, $t4, 8            \n"                            \
-      "shra.ph           $t9, $t9, 8            \n"                            \
-      "shra.ph           $t8, $t8, 8            \n"                            \
-      "shra.ph           $t2, $t2, 8            \n"                            \
-      "shra.ph           $t1, $t1, 8            \n"                            \
-      "addu.ph           $t5, $t5, $s5          \n"                            \
-      "addu.ph           $t4, $t4, $s5          \n"                            \
-      "addu.ph           $t9, $t9, $s5          \n"                            \
-      "addu.ph           $t8, $t8, $s5          \n"                            \
-      "addu.ph           $t2, $t2, $s5          \n"                            \
-      "addu.ph           $t1, $t1, $s5          \n"
-
-// TODO(fbarchard): accept yuv conversion constants.
-void I422ToARGBRow_DSPR2(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* rgb_buf,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm__ __volatile__ (
-    ".set push                                \n"
-    ".set noreorder                           \n"
-    "beqz              %[width], 2f           \n"
-    " repl.ph          $s0, 74                \n"  // |YG|YG| = |74|74|
-    "repl.ph           $s1, -25               \n"  // |UG|UG| = |-25|-25|
-    "repl.ph           $s2, -52               \n"  // |VG|VG| = |-52|-52|
-    "repl.ph           $s3, 102               \n"  // |VR|VR| = |102|102|
-    "repl.ph           $s4, 16                \n"  // |0|16|0|16|
-    "repl.ph           $s5, 128               \n"  // |128|128| // clipping
-    "lui               $s6, 0xff00            \n"
-    "ori               $s6, 0xff00            \n"  // |ff|00|ff|00|ff|
-
-   "1:                                        \n"
-      YUVTORGB
-// Arranging into argb format
-    "precr.qb.ph       $t4, $t8, $t4          \n"  // |G1|g1|B1|b1|
-    "precr.qb.ph       $t5, $t9, $t5          \n"  // |G0|g0|B0|b0|
-    "addiu             %[width], -4           \n"
-    "precrq.qb.ph      $t8, $t4, $t5          \n"  // |G1|B1|G0|B0|
-    "precr.qb.ph       $t9, $t4, $t5          \n"  // |g1|b1|g0|b0|
-    "precr.qb.ph       $t2, $t1, $t2          \n"  // |R1|r1|R0|r0|
-
-    "addiu             %[y_buf], 4            \n"
-    "preceu.ph.qbla    $t1, $t2               \n"  // |0 |R1|0 |R0|
-    "preceu.ph.qbra    $t2, $t2               \n"  // |0 |r1|0 |r0|
-    "or                $t1, $t1, $s6          \n"  // |ff|R1|ff|R0|
-    "or                $t2, $t2, $s6          \n"  // |ff|r1|ff|r0|
-    "precrq.ph.w       $t0, $t2, $t9          \n"  // |ff|r1|g1|b1|
-    "precrq.ph.w       $t3, $t1, $t8          \n"  // |ff|R1|G1|B1|
-    "sll               $t9, $t9, 16           \n"
-    "sll               $t8, $t8, 16           \n"
-    "packrl.ph         $t2, $t2, $t9          \n"  // |ff|r0|g0|b0|
-    "packrl.ph         $t1, $t1, $t8          \n"  // |ff|R0|G0|B0|
-// Store results.
-    "sw                $t2, 0(%[rgb_buf])     \n"
-    "sw                $t0, 4(%[rgb_buf])     \n"
-    "sw                $t1, 8(%[rgb_buf])     \n"
-    "sw                $t3, 12(%[rgb_buf])    \n"
-    "bnez              %[width], 1b           \n"
-    " addiu            %[rgb_buf], 16         \n"
-   "2:                                        \n"
-    ".set pop                                 \n"
-      :[y_buf] "+r" (y_buf),
-       [u_buf] "+r" (u_buf),
-       [v_buf] "+r" (v_buf),
-       [width] "+r" (width),
-       [rgb_buf] "+r" (rgb_buf)
-      :
-      : "t0", "t1",  "t2", "t3",  "t4", "t5",
-      "t6", "t7", "t8", "t9",
-      "s0", "s1", "s2", "s3",
-      "s4", "s5", "s6"
-  );
-}
-
-// Bilinear filter 8x2 -> 8x1
-void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
-    int y0_fraction = 256 - source_y_fraction;
-    const uint8* src_ptr1 = src_ptr + src_stride;
-
-  __asm__ __volatile__ (
-     ".set push                                           \n"
-     ".set noreorder                                      \n"
-
-     "replv.ph          $t0, %[y0_fraction]               \n"
-     "replv.ph          $t1, %[source_y_fraction]         \n"
-
-   "1:                                                    \n"
-     "lw                $t2, 0(%[src_ptr])                \n"
-     "lw                $t3, 0(%[src_ptr1])               \n"
-     "lw                $t4, 4(%[src_ptr])                \n"
-     "lw                $t5, 4(%[src_ptr1])               \n"
-     "muleu_s.ph.qbl    $t6, $t2, $t0                     \n"
-     "muleu_s.ph.qbr    $t7, $t2, $t0                     \n"
-     "muleu_s.ph.qbl    $t8, $t3, $t1                     \n"
-     "muleu_s.ph.qbr    $t9, $t3, $t1                     \n"
-     "muleu_s.ph.qbl    $t2, $t4, $t0                     \n"
-     "muleu_s.ph.qbr    $t3, $t4, $t0                     \n"
-     "muleu_s.ph.qbl    $t4, $t5, $t1                     \n"
-     "muleu_s.ph.qbr    $t5, $t5, $t1                     \n"
-     "addq.ph           $t6, $t6, $t8                     \n"
-     "addq.ph           $t7, $t7, $t9                     \n"
-     "addq.ph           $t2, $t2, $t4                     \n"
-     "addq.ph           $t3, $t3, $t5                     \n"
-     "shra.ph           $t6, $t6, 8                       \n"
-     "shra.ph           $t7, $t7, 8                       \n"
-     "shra.ph           $t2, $t2, 8                       \n"
-     "shra.ph           $t3, $t3, 8                       \n"
-     "precr.qb.ph       $t6, $t6, $t7                     \n"
-     "precr.qb.ph       $t2, $t2, $t3                     \n"
-     "addiu             %[src_ptr], %[src_ptr], 8         \n"
-     "addiu             %[src_ptr1], %[src_ptr1], 8       \n"
-     "addiu             %[dst_width], %[dst_width], -8    \n"
-     "sw                $t6, 0(%[dst_ptr])                \n"
-     "sw                $t2, 4(%[dst_ptr])                \n"
-     "bgtz              %[dst_width], 1b                  \n"
-     " addiu            %[dst_ptr], %[dst_ptr], 8         \n"
-
-     ".set pop                                            \n"
-  : [dst_ptr] "+r" (dst_ptr),
-    [src_ptr1] "+r" (src_ptr1),
-    [src_ptr] "+r" (src_ptr),
-    [dst_width] "+r" (dst_width)
-  : [source_y_fraction] "r" (source_y_fraction),
-    [y0_fraction] "r" (y0_fraction),
-    [src_stride] "r" (src_stride)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
-}
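
Each output byte is the 256-weighted average of the two source rows, matching the muleu_s.ph.qb*/addq.ph/shra.ph sequence above (no rounding term). Scalar sketch with an illustrative name:

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the bilinear blend: weights are
 * y0_fraction = 256 - source_y_fraction and source_y_fraction. */
static void InterpolateRowSketch(uint8_t* dst, const uint8_t* src,
                                 ptrdiff_t src_stride, int width,
                                 int source_y_fraction) {
  const uint8_t* src1 = src + src_stride;
  int y0_fraction = 256 - source_y_fraction;
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((src[x] * y0_fraction +
                        src1[x] * source_y_fraction) >> 8);
  }
}
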
-#endif  // __mips_dsp_rev >= 2
-
-#endif  // defined(__mips__)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
--- /dev/null
+++ b/third_party/libyuv/source/row_msa.cc
@@ -1,0 +1,3512 @@
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "libyuv/row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ALPHA_VAL (-1)
+
+// Fill YUV -> RGB conversion constants into vectors
+#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \
+  {                                                              \
+    ub = __msa_fill_w(yuvconst->kUVToB[0]);                      \
+    vr = __msa_fill_w(yuvconst->kUVToR[1]);                      \
+    ug = __msa_fill_w(yuvconst->kUVToG[0]);                      \
+    vg = __msa_fill_w(yuvconst->kUVToG[1]);                      \
+    bb = __msa_fill_w(yuvconst->kUVBiasB[0]);                    \
+    bg = __msa_fill_w(yuvconst->kUVBiasG[0]);                    \
+    br = __msa_fill_w(yuvconst->kUVBiasR[0]);                    \
+    yg = __msa_fill_w(yuvconst->kYToRgb[0]);                     \
+  }
+
+// Load YUV 422 pixel data
+#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v)    \
+  {                                                                \
+    uint64_t y_m;                                                  \
+    uint32_t u_m, v_m;                                             \
+    v4i32 zero_m = {0};                                            \
+    y_m = LD(psrc_y);                                              \
+    u_m = LW(psrc_u);                                              \
+    v_m = LW(psrc_v);                                              \
+    out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \
+    out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m);        \
+    out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m);        \
+  }
+
+// Clamp input vector elements to the range 0 to 255
+#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \
+  {                                               \
+    v4i32 max_m = __msa_ldi_w(0xFF);              \
+                                                  \
+    in0 = __msa_maxi_s_w(in0, 0);                 \
+    in1 = __msa_maxi_s_w(in1, 0);                 \
+    in2 = __msa_maxi_s_w(in2, 0);                 \
+    in3 = __msa_maxi_s_w(in3, 0);                 \
+    in4 = __msa_maxi_s_w(in4, 0);                 \
+    in5 = __msa_maxi_s_w(in5, 0);                 \
+    in0 = __msa_min_s_w(max_m, in0);              \
+    in1 = __msa_min_s_w(max_m, in1);              \
+    in2 = __msa_min_s_w(max_m, in2);              \
+    in3 = __msa_min_s_w(max_m, in3);              \
+    in4 = __msa_min_s_w(max_m, in4);              \
+    in5 = __msa_min_s_w(max_m, in5);              \
+  }
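
Per lane, the maxi_s_w/min_s_w pair is an ordinary clamp; the scalar form, used by the sketch after YUVTORGB below:

#include <stdint.h>

/* Scalar equivalent of CLIP_0TO255 for one lane. */
static int32_t Clip255(int32_t v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);
}
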
+
+// Convert 8 pixels of YUV 420 to RGB.
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \
+  {                                                                            \
+    v8i16 vec0_m, vec1_m;                                                      \
+    v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m;                              \
+    v4i32 reg5_m, reg6_m, reg7_m;                                              \
+    v16i8 zero_m = {0};                                                        \
+                                                                               \
+    vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y);                    \
+    vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv);                 \
+    reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m);                \
+    reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m);                \
+    reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m);                \
+    reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m);                \
+    reg0_m *= yg;                                                              \
+    reg1_m *= yg;                                                              \
+    reg2_m *= ubvr;                                                            \
+    reg3_m *= ubvr;                                                            \
+    reg0_m = __msa_srai_w(reg0_m, 16);                                         \
+    reg1_m = __msa_srai_w(reg1_m, 16);                                         \
+    reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg);                       \
+    reg5_m = __msa_ilvev_w(reg2_m, reg2_m);                                    \
+    reg6_m = __msa_ilvev_w(reg3_m, reg3_m);                                    \
+    reg7_m = __msa_ilvr_w(reg4_m, reg4_m);                                     \
+    reg2_m = __msa_ilvod_w(reg2_m, reg2_m);                                    \
+    reg3_m = __msa_ilvod_w(reg3_m, reg3_m);                                    \
+    reg4_m = __msa_ilvl_w(reg4_m, reg4_m);                                     \
+    reg5_m = reg0_m - reg5_m;                                                  \
+    reg6_m = reg1_m - reg6_m;                                                  \
+    reg2_m = reg0_m - reg2_m;                                                  \
+    reg3_m = reg1_m - reg3_m;                                                  \
+    reg7_m = reg0_m - reg7_m;                                                  \
+    reg4_m = reg1_m - reg4_m;                                                  \
+    reg5_m += bb;                                                              \
+    reg6_m += bb;                                                              \
+    reg7_m += bg;                                                              \
+    reg4_m += bg;                                                              \
+    reg2_m += br;                                                              \
+    reg3_m += br;                                                              \
+    reg5_m = __msa_srai_w(reg5_m, 6);                                          \
+    reg6_m = __msa_srai_w(reg6_m, 6);                                          \
+    reg7_m = __msa_srai_w(reg7_m, 6);                                          \
+    reg4_m = __msa_srai_w(reg4_m, 6);                                          \
+    reg2_m = __msa_srai_w(reg2_m, 6);                                          \
+    reg3_m = __msa_srai_w(reg3_m, 6);                                          \
+    CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m);               \
+    out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m);                       \
+    out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m);                       \
+    out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m);                       \
+  }
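
Per pixel, the macro performs libyuv's fixed-point transform: the ilvr_b self-interleave widens Y (a y * 0x0101), which is scaled by yg at >>16, then the U/V terms and biases are applied at >>6 and clamped. A scalar sketch using Clip255 from above; the parameter names are illustrative stand-ins for the vector constants:

#include <stdint.h>

static void YuvToRgbPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                                int32_t ub, int32_t ug, int32_t vg,
                                int32_t vr, int32_t bb, int32_t bg,
                                int32_t br, int32_t yg,
                                uint8_t* b, uint8_t* g, uint8_t* r) {
  /* Widen Y to 16 bits, scale by yg, keep the high half. */
  int32_t y1 = (int32_t)(((uint32_t)(y * 0x0101) * (uint32_t)yg) >> 16);
  *b = (uint8_t)Clip255((y1 - u * ub + bb) >> 6);
  *g = (uint8_t)Clip255((y1 - (u * ug + v * vg) + bg) >> 6);
  *r = (uint8_t)Clip255((y1 - v * vr + br) >> 6);
}
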
+
+// Pack and store 8 ARGB values.
+#define STOREARGB(in0, in1, in2, in3, pdst_argb)           \
+  {                                                        \
+    v8i16 vec0_m, vec1_m;                                  \
+    v16u8 dst0_m, dst1_m;                                  \
+    vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+    vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+    dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m);          \
+    dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m);          \
+    ST_UB2(dst0_m, dst1_m, pdst_argb, 16);                 \
+  }
+
+// Takes ARGB input and calculates Y.
+#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \
+                y_out)                                                     \
+  {                                                                        \
+    v16u8 vec0_m, vec1_m, vec2_m, vec3_m;                                  \
+    v8u16 reg0_m, reg1_m;                                                  \
+                                                                           \
+    vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0);             \
+    vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2);             \
+    vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0);             \
+    vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2);             \
+    reg0_m = __msa_dotp_u_h(vec0_m, const0);                               \
+    reg1_m = __msa_dotp_u_h(vec1_m, const0);                               \
+    reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1);                      \
+    reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1);                      \
+    reg0_m += const2;                                                      \
+    reg1_m += const2;                                                      \
+    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift);                    \
+    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift);                    \
+    y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);            \
+  }
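
The two dot products accumulate the packed channel weights in const0/const1, and const2 is the rounding bias. Per pixel this is the familiar weighted sum; a scalar sketch (the weights are whatever the caller loads — e.g. libyuv's BT.601 C path uses 25/129/66 for B/G/R with bias 0x1080 and shift 8):

#include <stdint.h>

static uint8_t ArgbToYSketch(uint8_t b, uint8_t g, uint8_t r,
                             int bc, int gc, int rc, int bias, int shift) {
  return (uint8_t)((b * bc + g * gc + r * rc + bias) >> shift);
}
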
+
+// Loads the current and next rows of ARGB input and averages them to
+// calculate U and V
+#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3)               \
+  {                                                                       \
+    v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \
+    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+    v16u8 vec8_m, vec9_m;                                                 \
+    v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \
+    v8u16 reg8_m, reg9_m;                                                 \
+                                                                          \
+    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 0);                        \
+    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 16);                       \
+    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 32);                       \
+    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 48);                       \
+    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 0);                        \
+    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 16);                       \
+    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 32);                       \
+    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 48);                       \
+    vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
+    vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
+    reg0_m = __msa_hadd_u_h(vec0_m, vec0_m);                              \
+    reg1_m = __msa_hadd_u_h(vec1_m, vec1_m);                              \
+    reg2_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
+    reg3_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
+    reg4_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
+    reg5_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
+    reg6_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
+    reg7_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
+    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
+    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
+    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
+    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
+    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
+    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
+    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
+    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
+    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
+    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
+    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
+    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
+    argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
+    argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
+    src0_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 64);                       \
+    src1_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 80);                       \
+    src2_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 96);                       \
+    src3_m = (v16u8)__msa_ld_b((v16i8*)s_ptr, 112);                      \
+    src4_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 64);                       \
+    src5_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 80);                       \
+    src6_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 96);                       \
+    src7_m = (v16u8)__msa_ld_b((v16i8*)t_ptr, 112);                      \
+    vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m);           \
+    vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m);           \
+    vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m);           \
+    vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m);           \
+    vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m);           \
+    reg0_m = __msa_hadd_u_h(vec2_m, vec2_m);                              \
+    reg1_m = __msa_hadd_u_h(vec3_m, vec3_m);                              \
+    reg2_m = __msa_hadd_u_h(vec4_m, vec4_m);                              \
+    reg3_m = __msa_hadd_u_h(vec5_m, vec5_m);                              \
+    reg4_m = __msa_hadd_u_h(vec6_m, vec6_m);                              \
+    reg5_m = __msa_hadd_u_h(vec7_m, vec7_m);                              \
+    reg6_m = __msa_hadd_u_h(vec8_m, vec8_m);                              \
+    reg7_m = __msa_hadd_u_h(vec9_m, vec9_m);                              \
+    reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m);          \
+    reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m);          \
+    reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m);         \
+    reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m);         \
+    reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m);          \
+    reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m);          \
+    reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m);         \
+    reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m);         \
+    reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2);                       \
+    reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2);                       \
+    reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2);                       \
+    reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2);                       \
+    argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m);           \
+    argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m);           \
+  }
+
+// Takes ARGB input and calculates U and V.
+#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \
+                 shf0, shf1, shf2, shf3, v_out, u_out)                       \
+  {                                                                          \
+    v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
+    v8u16 reg0_m, reg1_m, reg2_m, reg3_m;                                    \
+                                                                             \
+    vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0);          \
+    vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2);          \
+    vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0);          \
+    vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2);          \
+    vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0);          \
+    vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2);          \
+    vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0);          \
+    vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2);          \
+    reg0_m = __msa_dotp_u_h(vec0_m, const1);                                 \
+    reg1_m = __msa_dotp_u_h(vec1_m, const1);                                 \
+    reg2_m = __msa_dotp_u_h(vec4_m, const1);                                 \
+    reg3_m = __msa_dotp_u_h(vec5_m, const1);                                 \
+    reg0_m += const3;                                                        \
+    reg1_m += const3;                                                        \
+    reg2_m += const3;                                                        \
+    reg3_m += const3;                                                        \
+    reg0_m -= __msa_dotp_u_h(vec2_m, const0);                                \
+    reg1_m -= __msa_dotp_u_h(vec3_m, const0);                                \
+    reg2_m -= __msa_dotp_u_h(vec6_m, const2);                                \
+    reg3_m -= __msa_dotp_u_h(vec7_m, const2);                                \
+    v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m);              \
+    u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m);              \
+  }
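
The vshf_b shuffles gather the averaged channel pairs, and the dot products form U and V as signed channel differences plus a bias. Scalar sketch using the BT.601 weights libyuv's C path applies (the MSA constants are caller-supplied):

#include <stdint.h>

static uint8_t ArgbToUSketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
}

static uint8_t ArgbToVSketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}
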
+
+// Load I444 pixel data
+#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \
+  {                                                           \
+    uint64_t y_m, u_m, v_m;                                   \
+    v2i64 zero_m = {0};                                       \
+    y_m = LD(psrc_y);                                         \
+    u_m = LD(psrc_u);                                         \
+    v_m = LD(psrc_v);                                         \
+    out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m);   \
+    out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m);   \
+    out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m);   \
+  }
+
+void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  src += width - 64;
+
+  for (x = 0; x < width; x += 64) {
+    LD_UB4(src, 16, src3, src2, src1, src0);
+    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
+    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+    dst += 64;
+    src -= 64;
+  }
+}
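
The loop reads four vectors from the tail of the row (note the reversed register order in LD_UB4) and a reversing shuffle writes them out front to back. The scalar equivalent is simply:

#include <stdint.h>

static void MirrorRowSketch(const uint8_t* src, uint8_t* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}
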
+
+void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+  src += width * 4 - 64;
+
+  for (x = 0; x < width; x += 16) {
+    LD_UB4(src, 16, src3, src2, src1, src0);
+    VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2);
+    VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
+    dst += 64;
+    src -= 64;
+  }
+}
+
+void I422ToYUY2Row_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_yuy2,
+                       int width) {
+  int x;
+  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+  v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;
+
+  for (x = 0; x < width; x += 32) {
+    src_u0 = LD_UB(src_u);
+    src_v0 = LD_UB(src_v);
+    LD_UB2(src_y, 16, src_y0, src_y1);
+    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+    ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
+    ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
+    ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
+    src_u += 16;
+    src_v += 16;
+    src_y += 32;
+    dst_yuy2 += 64;
+  }
+}
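
The two ILVRL_B2_UB stages first interleave V over U, then interleave Y with the UV pairs, producing the YUY2 byte order Y0 U0 Y1 V0 (I422ToUYVYRow_MSA below does the same with the operands swapped to get U0 Y0 V0 Y1). Scalar sketch, tail handling omitted:

#include <stdint.h>

static void I422ToYUY2RowSketch(const uint8_t* src_y, const uint8_t* src_u,
                                const uint8_t* src_v, uint8_t* dst_yuy2,
                                int width) {
  int x;
  for (x = 0; x < width; x += 2) {  /* one U/V pair covers two Y samples */
    dst_yuy2[0] = src_y[0];
    dst_yuy2[1] = src_u[0];
    dst_yuy2[2] = src_y[1];
    dst_yuy2[3] = src_v[0];
    src_y += 2;
    ++src_u;
    ++src_v;
    dst_yuy2 += 4;
  }
}
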
+
+void I422ToUYVYRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_uyvy,
+                       int width) {
+  int x;
+  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+  v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;
+
+  for (x = 0; x < width; x += 32) {
+    src_u0 = LD_UB(src_u);
+    src_v0 = LD_UB(src_v);
+    LD_UB2(src_y, 16, src_y0, src_y1);
+    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+    ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
+    ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
+    ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
+    src_u += 16;
+    src_v += 16;
+    src_y += 32;
+    dst_uyvy += 64;
+  }
+}
+
+void I422ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_argb += 32;
+  }
+}
+
+void I422ToRGBARow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(alpha, vec0, vec1, vec2, dst_argb);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_argb += 32;
+  }
+}
+
+void I422AlphaToARGBRow_MSA(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            const uint8_t* src_a,
+                            uint8_t* dst_argb,
+                            const struct YuvConstants* yuvconstants,
+                            int width) {
+  int x;
+  int64_t data_a;
+  v16u8 src0, src1, src2, src3;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v4i32 zero = {0};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    data_a = LD(src_a);
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3);
+    STOREARGB(vec0, vec1, vec2, src3, dst_argb);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    src_a += 8;
+    dst_argb += 32;
+  }
+}
+
+void I422ToRGB24Row_MSA(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
+                        const struct YuvConstants* yuvconstants,
+                        int32_t width) {
+  int x;
+  int64_t data_u, data_v;
+  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 reg0, reg1, reg2, reg3;
+  v2i64 zero = {0};
+  v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
+  v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10};
+  v16i8 shuffler2 = {26, 6,  7,  27, 8,  9,  28, 10,
+                     11, 29, 12, 13, 30, 14, 15, 31};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0);
+    data_u = LD(src_u);
+    data_v = LD(src_v);
+    src1 = (v16u8)__msa_insert_d(zero, 0, data_u);
+    src2 = (v16u8)__msa_insert_d(zero, 0, data_v);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8);
+    src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec3, vec4, vec5);
+    reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3);
+    reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2);
+    reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    ST_UB(dst2, (dst_argb + 32));
+    src_y += 16;
+    src_u += 8;
+    src_v += 8;
+    dst_argb += 48;
+  }
+}
+
+// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R.
+void I422ToRGB565Row_MSA(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec2, vec1);
+    vec0 = __msa_srai_h(vec0, 3);
+    vec1 = __msa_srai_h(vec1, 3);
+    vec2 = __msa_srai_h(vec2, 2);
+    vec1 = __msa_slli_h(vec1, 11);
+    vec2 = __msa_slli_h(vec2, 5);
+    vec0 |= vec1;
+    dst0 = (v16u8)(vec2 | vec0);
+    ST_UB(dst0, dst_rgb565);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_rgb565 += 16;
+  }
+}
+
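+// Converts 8 I422 pixels per iteration to ARGB4444: each channel keeps its
+// top 4 bits and alpha is forced to 0xF via const_0xF000.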
+// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G.
+void I422ToARGB4444Row_MSA(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb4444,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0;
+  v8i16 vec0, vec1, vec2;
+  v8u16 reg0, reg1, reg2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    reg0 = (v8u16)__msa_srai_h(vec0, 4);
+    reg1 = (v8u16)__msa_srai_h(vec1, 4);
+    reg2 = (v8u16)__msa_srai_h(vec2, 4);
+    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4);
+    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8);
+    reg1 |= const_0xF000;
+    reg0 |= reg2;
+    dst0 = (v16u8)(reg1 | reg0);
+    ST_UB(dst0, dst_argb4444);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_argb4444 += 16;
+  }
+}
+
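+// Converts 8 I422 pixels per iteration to ARGB1555: 5 bits per channel,
+// with the alpha bit forced to 1 via const_0x8000.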
+void I422ToARGB1555Row_MSA(const uint8_t* src_y,
+                           const uint8_t* src_u,
+                           const uint8_t* src_v,
+                           uint8_t* dst_argb1555,
+                           const struct YuvConstants* yuvconstants,
+                           int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0;
+  v8i16 vec0, vec1, vec2;
+  v8u16 reg0, reg1, reg2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    READYUV422(src_y, src_u, src_v, src0, src1, src2);
+    src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    reg0 = (v8u16)__msa_srai_h(vec0, 3);
+    reg1 = (v8u16)__msa_srai_h(vec1, 3);
+    reg2 = (v8u16)__msa_srai_h(vec2, 3);
+    reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5);
+    reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10);
+    reg1 |= const_0x8000;
+    reg0 |= reg2;
+    dst0 = (v16u8)(reg1 | reg0);
+    ST_UB(dst0, dst_argb1555);
+    src_y += 8;
+    src_u += 4;
+    src_v += 4;
+    dst_argb1555 += 16;
+  }
+}
+
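+// Copies the luma plane out of packed YUY2, where Y occupies the even byte
+// positions; pckev_b gathers 32 Y bytes per iteration.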
+void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_yuy2 += 64;
+    dst_y += 32;
+  }
+}
+
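+// Extracts chroma from two YUY2 rows: pckod_b gathers the interleaved U/V
+// bytes, the two rows are averaged, then U and V are deinterleaved.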
+void YUY2ToUVRow_MSA(const uint8_t* src_yuy2,
+                     int src_stride_yuy2,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
+    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+    src3 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+    vec0 = __msa_aver_u_b(src0, src2);
+    vec1 = __msa_aver_u_b(src1, src3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_yuy2 += 64;
+    src_yuy2_next += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
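+// Single-row variant of YUY2ToUVRow_MSA: emits 4:2:2 chroma without
+// vertical averaging.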
+void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_yuy2 += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
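+// UYVY counterpart of YUY2ToYRow_MSA: Y sits at the odd byte positions, so
+// pckod_b gathers the luma.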
+void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_uyvy += 64;
+    dst_y += 32;
+  }
+}
+
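+// UYVY counterpart of YUY2ToUVRow_MSA: U/V occupy the even bytes, so
+// pckev_b gathers them before the two rows are averaged.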
+void UYVYToUVRow_MSA(const uint8_t* src_uyvy,
+                     int src_stride_uyvy,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
+    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+    src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+    vec0 = __msa_aver_u_b(src0, src2);
+    vec1 = __msa_aver_u_b(src1, src3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_uyvy += 64;
+    src_uyvy_next += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void UYVYToUV422Row_MSA(const uint8_t* src_uyvy,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_uyvy += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
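+// BT.601 luma from ARGB, 16 pixels per iteration:
+// Y = (25 * B + 129 * G + 66 * R + 0x1080) >> 8, where the 0x1080 bias
+// folds in the +16 offset and rounding.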
+void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v16i8 zero = {0};
+  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
+  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
+  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0);
+    reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1);
+    reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2);
+    reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3);
+    reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0);
+    reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1);
+    reg0 *= const_0x19;
+    reg1 *= const_0x19;
+    reg2 *= const_0x81;
+    reg3 *= const_0x81;
+    reg4 *= const_0x42;
+    reg5 *= const_0x42;
+    reg0 += reg2;
+    reg1 += reg3;
+    reg0 += reg4;
+    reg1 += reg5;
+    reg0 += const_0x1080;
+    reg1 += const_0x1080;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
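+// Chroma from ARGB with 2x2 subsampling: pixel pairs are averaged both
+// horizontally and across the two input rows, then, biased by 0x8080:
+// U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
+// V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8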
+void ARGBToUVRow_MSA(const uint8_t* src_argb0,
+                     int src_stride_argb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  int x;
+  const uint8_t* src_argb0_next = src_argb0 + src_stride_argb;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
+  v16u8 dst0, dst1;
+  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48);
+    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64);
+    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80);
+    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96);
+    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+    reg0 = __msa_hadd_u_h(vec8, vec8);
+    reg1 = __msa_hadd_u_h(vec9, vec9);
+    reg2 = __msa_hadd_u_h(vec4, vec4);
+    reg3 = __msa_hadd_u_h(vec5, vec5);
+    reg4 = __msa_hadd_u_h(vec0, vec0);
+    reg5 = __msa_hadd_u_h(vec1, vec1);
+    src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16);
+    src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32);
+    src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48);
+    src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64);
+    src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80);
+    src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96);
+    src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4);
+    vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6);
+    vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4);
+    vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6);
+    vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2);
+    reg0 += __msa_hadd_u_h(vec8, vec8);
+    reg1 += __msa_hadd_u_h(vec9, vec9);
+    reg2 += __msa_hadd_u_h(vec4, vec4);
+    reg3 += __msa_hadd_u_h(vec5, vec5);
+    reg4 += __msa_hadd_u_h(vec0, vec0);
+    reg5 += __msa_hadd_u_h(vec1, vec1);
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2);
+    reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2);
+    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2);
+    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2);
+    reg6 = reg0 * const_0x70;
+    reg7 = reg1 * const_0x70;
+    reg8 = reg2 * const_0x4A;
+    reg9 = reg3 * const_0x4A;
+    reg6 += const_0x8080;
+    reg7 += const_0x8080;
+    reg8 += reg4 * const_0x26;
+    reg9 += reg5 * const_0x26;
+    reg0 *= const_0x12;
+    reg1 *= const_0x12;
+    reg2 *= const_0x5E;
+    reg3 *= const_0x5E;
+    reg4 *= const_0x70;
+    reg5 *= const_0x70;
+    reg2 += reg0;
+    reg3 += reg1;
+    reg4 += const_0x8080;
+    reg5 += const_0x8080;
+    reg6 -= reg8;
+    reg7 -= reg9;
+    reg4 -= reg2;
+    reg5 -= reg3;
+    reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8);
+    reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8);
+    reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8);
+    reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_argb0 += 128;
+    src_argb0_next += 128;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
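+// Drops the alpha byte: shuffles 16 ARGB pixels (64 bytes) down to 48
+// bytes of packed B,G,R per iteration.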
+void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+  v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20};
+  v16i8 shuffler1 = {5,  6,  8,  9,  10, 12, 13, 14,
+                     16, 17, 18, 20, 21, 22, 24, 25};
+  v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20,
+                     21, 22, 24, 25, 26, 28, 29, 30};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_rgb, 16);
+    ST_UB(dst2, (dst_rgb + 32));
+    src_argb += 64;
+    dst_rgb += 48;
+  }
+}
+
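+// Like ARGBToRGB24Row_MSA, but the {2, 1, 0, ...} shufflers also swap the
+// first and third channels, producing byte-reversed (RAW) output.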
+void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2;
+  v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22};
+  v16i8 shuffler1 = {5,  4,  10, 9,  8,  14, 13, 12,
+                     18, 17, 16, 22, 21, 20, 26, 25};
+  v16i8 shuffler2 = {8,  14, 13, 12, 18, 17, 16, 22,
+                     21, 20, 26, 25, 24, 30, 29, 28};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_rgb, 16);
+    ST_UB(dst2, (dst_rgb + 32));
+    src_argb += 64;
+    dst_rgb += 48;
+  }
+}
+
+void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3);
+    vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5);
+    vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+    vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3);
+    vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5);
+    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+    vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1);
+    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2);
+    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2);
+    vec0 = __msa_binsli_b(vec0, vec1, 2);
+    vec1 = __msa_binsli_b(vec2, vec3, 4);
+    vec4 = __msa_binsli_b(vec4, vec5, 2);
+    vec5 = __msa_binsli_b(vec6, vec7, 4);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4);
+    dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
+void ARGBToARGB1555Row_MSA(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3);
+    vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2);
+    vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3);
+    vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1);
+    vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1);
+    vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1);
+    vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3);
+    vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2);
+    vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3);
+    vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1);
+    vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1);
+    vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1);
+    vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2);
+    vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2);
+    vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3);
+    vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3);
+    vec0 = __msa_binsli_b(vec0, vec1, 2);
+    vec5 = __msa_binsli_b(vec5, vec6, 2);
+    vec1 = __msa_binsli_b(vec2, vec3, 5);
+    vec6 = __msa_binsli_b(vec7, vec8, 5);
+    vec1 = __msa_binsli_b(vec1, vec4, 0);
+    vec6 = __msa_binsli_b(vec6, vec9, 0);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5);
+    dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
+void ARGBToARGB4444Row_MSA(const uint8_t* src_argb,
+                           uint8_t* dst_rgb,
+                           int width) {
+  int x;
+  v16u8 src0, src1;
+  v16u8 vec0, vec1;
+  v16u8 dst0;
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4);
+    vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4);
+    src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1);
+    src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1);
+    vec0 = __msa_binsli_b(vec0, src0, 3);
+    vec1 = __msa_binsli_b(vec1, src1, 3);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
+void ARGBToUV444Row_MSA(const uint8_t* src_argb,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int32_t width) {
+  int32_t x;
+  v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 vec8, vec9, vec10, vec11;
+  v8u16 const_112 = (v8u16)__msa_ldi_h(112);
+  v8u16 const_74 = (v8u16)__msa_ldi_h(74);
+  v8u16 const_38 = (v8u16)__msa_ldi_h(38);
+  v8u16 const_94 = (v8u16)__msa_ldi_h(94);
+  v8u16 const_18 = (v8u16)__msa_ldi_h(18);
+  v8u16 const_32896 = (v8u16)__msa_fill_h(32896);
+  v16i8 zero = {0};
+
+  for (x = width; x > 0; x -= 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48);
+    reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+    src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0);
+    vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
+    vec10 = vec0 * const_18;
+    vec11 = vec1 * const_18;
+    vec8 = vec2 * const_94;
+    vec9 = vec3 * const_94;
+    vec6 = vec4 * const_112;
+    vec7 = vec5 * const_112;
+    vec0 *= const_112;
+    vec1 *= const_112;
+    vec2 *= const_74;
+    vec3 *= const_74;
+    vec4 *= const_38;
+    vec5 *= const_38;
+    vec8 += vec10;
+    vec9 += vec11;
+    vec6 += const_32896;
+    vec7 += const_32896;
+    vec0 += const_32896;
+    vec1 += const_32896;
+    vec2 += vec4;
+    vec3 += vec5;
+    vec0 -= vec2;
+    vec1 -= vec3;
+    vec6 -= vec8;
+    vec7 -= vec9;
+    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+    vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8);
+    vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_argb += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
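+// Per-channel multiply of two ARGB rows, 4 pixels per iteration. One
+// operand is widened to c * 257 (its byte duplicated into both halves of a
+// halfword) so that taking bits 16+ of the 32-bit product approximates
+// (a * b) / 255.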
+void ARGBMultiplyRow_MSA(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v4u32 reg0, reg1, reg2, reg3;
+  v8i16 zero = {0};
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16);
+    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16);
+    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16);
+    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_argb);
+    src_argb0 += 16;
+    src_argb1 += 16;
+    dst_argb += 16;
+  }
+}
+
+void ARGBAddRow_MSA(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16);
+    dst0 = __msa_adds_u_b(src0, src2);
+    dst1 = __msa_adds_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
+void ARGBSubtractRow_MSA(const uint8_t* src_argb0,
+                         const uint8_t* src_argb1,
+                         uint8_t* dst_argb,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16);
+    dst0 = __msa_subs_u_b(src0, src2);
+    dst1 = __msa_subs_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
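+// Premultiplies each pixel's B, G and R by its alpha (approximately
+// c * a / 255); the final bmnz with 'mask' writes the original alpha byte
+// back into the result. 8 pixels per iteration.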
+void ARGBAttenuateRow_MSA(const uint8_t* src_argb,
+                          uint8_t* dst_argb,
+                          int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v8i16 zero = {0};
+  v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1);
+    vec4 = (v8u16)__msa_fill_h(vec0[3]);
+    vec5 = (v8u16)__msa_fill_h(vec0[7]);
+    vec6 = (v8u16)__msa_fill_h(vec1[3]);
+    vec7 = (v8u16)__msa_fill_h(vec1[7]);
+    vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+    vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    vec6 = (v8u16)__msa_fill_h(vec2[3]);
+    vec7 = (v8u16)__msa_fill_h(vec2[7]);
+    vec8 = (v8u16)__msa_fill_h(vec3[3]);
+    vec9 = (v8u16)__msa_fill_h(vec3[7]);
+    vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec4);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5);
+    reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6);
+    reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6);
+    reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7);
+    reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7);
+    reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+    reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+    reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+    reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+    reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2);
+    reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2);
+    reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3);
+    reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3);
+    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
+    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
+    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
+    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
+    reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24);
+    reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24);
+    reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24);
+    reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+    vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    dst0 = __msa_bmnz_v(dst0, src0, mask);
+    dst1 = __msa_bmnz_v(dst1, src1, mask);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
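+// RGB565 conversion with dithering: the replicated dither4 bytes are added
+// to each channel and clamped to [0, 255] before the 5/6/5 reduction.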
+void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb,
+                               uint8_t* dst_rgb,
+                               uint32_t dither4,
+                               int width) {
+  int x;
+  v16u8 src0, src1, dst0, vec0, vec1;
+  v8i16 vec_d0;
+  v8i16 reg0, reg1, reg2;
+  v16i8 zero = {0};
+  v8i16 max = __msa_ldi_h(0xFF);
+
+  vec_d0 = (v8i16)__msa_fill_w(dither4);
+  vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0);
+    reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1);
+    reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0);
+    reg0 += vec_d0;
+    reg1 += vec_d0;
+    reg2 += vec_d0;
+    reg0 = __msa_maxi_s_h((v8i16)reg0, 0);
+    reg1 = __msa_maxi_s_h((v8i16)reg1, 0);
+    reg2 = __msa_maxi_s_h((v8i16)reg2, 0);
+    reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0);
+    reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1);
+    reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2);
+    reg0 = __msa_srai_h(reg0, 3);
+    reg2 = __msa_srai_h(reg2, 3);
+    reg1 = __msa_srai_h(reg1, 2);
+    reg2 = __msa_slli_h(reg2, 11);
+    reg1 = __msa_slli_h(reg1, 5);
+    reg0 |= reg1;
+    dst0 = (v16u8)(reg0 | reg2);
+    ST_UB(dst0, dst_rgb);
+    src_argb += 32;
+    dst_rgb += 16;
+  }
+}
+
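+// Reorders the bytes within each ARGB pixel: the 4-byte 'shuffler' table is
+// broadcast to all four pixels of a vector and applied with vshf.b.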
+void ARGBShuffleRow_MSA(const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        const uint8_t* shuffler,
+                        int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1;
+  v16i8 vec0;
+  v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
+  int32_t val = LW((int32_t*)shuffler);
+
+  vec0 = (v16i8)__msa_fill_w(val);
+  shuffler_vec += vec0;
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16);
+    dst0 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
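+// Scales each channel by the matching byte of 'value': both factors are
+// widened to c * 257 and the 32-bit product is shifted down by 24, which
+// works out to roughly (c * scale) >> 8. 4 pixels per iteration.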
+void ARGBShadeRow_MSA(const uint8_t* src_argb,
+                      uint8_t* dst_argb,
+                      int width,
+                      uint32_t value) {
+  int x;
+  v16u8 src0, dst0;
+  v8u16 vec0, vec1;
+  v4u32 reg0, reg1, reg2, reg3, rgba_scale;
+  v8i16 zero = {0};
+
+  rgba_scale[0] = value;
+  rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale);
+  rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale);
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0);
+    reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0);
+    reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1);
+    reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1);
+    reg0 *= rgba_scale;
+    reg1 *= rgba_scale;
+    reg2 *= rgba_scale;
+    reg3 *= rgba_scale;
+    reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24);
+    reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24);
+    reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24);
+    reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_argb);
+    src_argb += 16;
+    dst_argb += 16;
+  }
+}
+
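+// Grayscale: two dot products compute (15 * B + 75 * G + 38 * R + 64) >> 7
+// per pixel, which is replicated into B, G and R; alpha is preserved.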
+void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, dst0, dst1;
+  v8u16 reg0;
+  v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26);
+  v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16);
+    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
+    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
+    reg0 = __msa_dotp_u_h(vec0, const_0x4B0F);
+    reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26);
+    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7);
+    vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0);
+    vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
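+// In-place sepia filter. Per pixel, each result is shifted down by 7, with
+// G and R clamped to 255:
+//   B = 17 * B + 68 * G + 35 * R
+//   G = 22 * B + 88 * G + 45 * R
+//   R = 24 * B + 98 * G + 50 * R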
+void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5;
+  v8u16 reg0, reg1, reg2;
+  v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411);
+  v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23);
+  v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816);
+  v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D);
+  v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218);
+  v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32);
+  v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16);
+    vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0);
+    vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0);
+    vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1);
+    reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411);
+    reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816);
+    reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218);
+    reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23);
+    reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D);
+    reg2 = (v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32);
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7);
+    reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF);
+    reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2);
+    vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+    vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    dst_argb += 32;
+  }
+}
+
+void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444,
+                           uint8_t* dst_argb,
+                           int width) {
+  int x;
+  v16u8 src0, src1;
+  v8u16 vec0, vec1, vec2, vec3;
+  v16u8 dst0, dst1, dst2, dst3;
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 0);
+    src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 16);
+    vec0 = (v8u16)__msa_andi_b(src0, 0x0F);
+    vec1 = (v8u16)__msa_andi_b(src1, 0x0F);
+    vec2 = (v8u16)__msa_andi_b(src0, 0xF0);
+    vec3 = (v8u16)__msa_andi_b(src1, 0xF0);
+    vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4);
+    vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4);
+    vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4);
+    vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_argb4444 += 32;
+    dst_argb += 64;
+  }
+}
+
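+// Expands ARGB1555 to ARGB: each 5-bit channel becomes (x << 3) | (x >> 2)
+// and the alpha bit is stretched to 0x00/0xFF by negation (reg3 = -reg3).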
+void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555,
+                           uint8_t* dst_argb,
+                           int width) {
+  int x;
+  v8u16 src0, src1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6;
+  v16u8 dst0, dst1, dst2, dst3;
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 0);
+    src1 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+    vec2 = src0 & const_0x1F;
+    vec3 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srli_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srli_h((v8i16)src1, 5);
+    reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3);
+    reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3);
+    reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3);
+    reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2);
+    reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2);
+    reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2);
+    reg3 = -reg3;
+    reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4);
+    reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4);
+    reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5);
+    reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_argb1555 += 32;
+    dst_argb += 64;
+  }
+}
+
+void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565,
+                         uint8_t* dst_argb,
+                         int width) {
+  int x;
+  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
+  v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 0);
+    src1 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src0 & const_0x7E0;
+    vec2 = src0 & const_0xF800;
+    vec3 = src1 & const_0x1F;
+    vec4 = src1 & const_0x7E0;
+    vec5 = src1 & const_0xF800;
+    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+    reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
+    reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
+    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+    reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
+    reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
+    reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
+    reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
+    reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
+    reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
+    reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
+    reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1);
+    res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3);
+    res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_rgb565 += 32;
+    dst_argb += 64;
+  }
+}
+
+void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24,
+                        uint8_t* dst_argb,
+                        int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v16u8 vec0, vec1, vec2;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 32);
+    vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
+    vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+    vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
+    dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1);
+    dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_rgb24 += 48;
+    dst_argb += 64;
+  }
+}
+
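+// Expands packed R,G,B (RAW) to ARGB: the mask swaps each pixel into B,G,R
+// order and splices in an opaque alpha byte from the 'alpha' vector.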
+void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v16u8 vec0, vec1, vec2;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32);
+    vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12);
+    vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+    vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4);
+    dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1);
+    dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_raw += 48;
+    dst_argb += 64;
+  }
+}
+
+void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
+                        uint8_t* dst_y,
+                        int width) {
+  int x;
+  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v16u8 dst0;
+  v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19);
+  v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81);
+  v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 0);
+    src1 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    vec2 = src0 & const_0x1F;
+    vec3 = src1 & const_0x1F;
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+    reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3);
+    reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2);
+    reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2);
+    reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3);
+    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+    reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2);
+    reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2);
+    reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3);
+    reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3);
+    reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2);
+    reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2);
+    reg0 *= const_0x19;
+    reg1 *= const_0x19;
+    reg2 *= const_0x81;
+    reg3 *= const_0x81;
+    reg4 *= const_0x42;
+    reg5 *= const_0x42;
+    reg0 += reg2;
+    reg1 += reg3;
+    reg0 += reg4;
+    reg1 += reg5;
+    reg0 += const_0x1080;
+    reg1 += const_0x1080;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    ST_UB(dst0, dst_y);
+    src_argb1555 += 32;
+    dst_y += 16;
+  }
+}
+
+void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+  int x;
+  v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v4u32 res0, res1, res2, res3;
+  v16u8 dst0;
+  v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019);
+  v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042);
+  v8i16 const_0x1080 = __msa_fill_h(0x1080);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0);
+  v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 0);
+    src1 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src0 & const_0x7E0;
+    vec2 = src0 & const_0xF800;
+    vec3 = src1 & const_0x1F;
+    vec4 = src1 & const_0x7E0;
+    vec5 = src1 & const_0xF800;
+    reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3);
+    reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3);
+    reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8);
+    reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3);
+    reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3);
+    reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8);
+    reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2);
+    reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9);
+    reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13);
+    reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2);
+    reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9);
+    reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13);
+    vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0);
+    vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3);
+    vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3);
+    vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2);
+    vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2);
+    vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5);
+    vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5);
+    res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019);
+    res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019);
+    res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019);
+    res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019);
+    res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042);
+    res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042);
+    res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042);
+    res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042);
+    res0 = (v4u32)__msa_srai_w((v4i32)res0, 8);
+    res1 = (v4u32)__msa_srai_w((v4i32)res1, 8);
+    res2 = (v4u32)__msa_srai_w((v4i32)res2, 8);
+    res3 = (v4u32)__msa_srai_w((v4i32)res3, 8);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_y);
+    src_rgb565 += 32;
+    dst_y += 16;
+  }
+}
+
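+// Luma from packed B,G,R: the masks regroup the 3-byte pixels into 4-byte
+// lanes so two dot products can evaluate
+// Y = (25 * B + 129 * G + 66 * R + 0x1080) >> 8, 16 pixels per iteration.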
+void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119);
+  v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+  v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
+  v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
+                 18, 19, 20, 21, 21, 22, 23, 24};
+  v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
+  v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
+    reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
+    reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+    vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+    vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119);
+    vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119);
+    vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42);
+    vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42);
+    vec0 += const_0x1080;
+    vec1 += const_0x1080;
+    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 48;
+    dst_y += 16;
+  }
+}
+
+void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142);
+  v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+  v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12};
+  v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18,
+                 18, 19, 20, 21, 21, 22, 23, 24};
+  v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20};
+  v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0);
+    reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1);
+    reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+    vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+    vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142);
+    vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142);
+    vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19);
+    vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19);
+    vec0 += const_0x1080;
+    vec1 += const_0x1080;
+    vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 48;
+    dst_y += 16;
+  }
+}
+
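+// Chroma from two rows of ARGB1555 with 2x2 subsampling: the 5-bit channels
+// are summed across rows and pixel pairs, rescaled to 8 bits, then run
+// through the same 112/74/38/94/18 fixed-point U/V formulas biased by
+// 0x8080.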
+void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
+                         int src_stride_argb1555,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
+                         int width) {
+  int x;
+  const uint16_t* s = (const uint16_t*)src_argb1555;
+  const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555);
+  int64_t res0, res1;
+  v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((v8i16*)s, 0);
+    src1 = (v8u16)__msa_ld_b((v8i16*)s, 16);
+    src2 = (v8u16)__msa_ld_b((v8i16*)t, 0);
+    src3 = (v8u16)__msa_ld_b((v8i16*)t, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    vec0 += src2 & const_0x1F;
+    vec1 += src3 & const_0x1F;
+    vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+    vec2 = src0 & const_0x1F;
+    vec3 = src1 & const_0x1F;
+    vec2 += src2 & const_0x1F;
+    vec3 += src3 & const_0x1F;
+    vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    vec4 += src2 & const_0x1F;
+    vec5 += src3 & const_0x1F;
+    vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
+    vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
+    vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
+    vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
+    vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1);
+    vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6);
+    reg0 = vec6 * const_0x70;
+    reg1 = vec0 * const_0x4A;
+    reg2 = vec2 * const_0x70;
+    reg3 = vec0 * const_0x5E;
+    reg0 += const_0x8080;
+    reg1 += vec2 * const_0x26;
+    reg2 += const_0x8080;
+    reg3 += vec6 * const_0x12;
+    reg0 -= reg1;
+    reg2 -= reg3;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    s += 16;
+    t += 16;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
+void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
+                       int src_stride_rgb565,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  int x;
+  const uint16_t* s = (const uint16_t*)src_rgb565;
+  const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565);
+  int64_t res0, res1;
+  v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+  v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F);
+  v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v8u16)__msa_ld_b((const v8i16*)s, 0);
+    src1 = (v8u16)__msa_ld_b((const v8i16*)s, 16);
+    src2 = (v8u16)__msa_ld_b((const v8i16*)t, 0);
+    src3 = (v8u16)__msa_ld_b((const v8i16*)t, 16);
+    vec0 = src0 & const_0x1F;
+    vec1 = src1 & const_0x1F;
+    vec0 += src2 & const_0x1F;
+    vec1 += src3 & const_0x1F;
+    vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 5);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 5);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 5);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 5);
+    vec2 = src0 & const_0x3F;
+    vec3 = src1 & const_0x3F;
+    vec2 += src2 & const_0x3F;
+    vec3 += src3 & const_0x3F;
+    vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    src0 = (v8u16)__msa_srai_h((v8i16)src0, 6);
+    src1 = (v8u16)__msa_srai_h((v8i16)src1, 6);
+    src2 = (v8u16)__msa_srai_h((v8i16)src2, 6);
+    src3 = (v8u16)__msa_srai_h((v8i16)src3, 6);
+    vec4 = src0 & const_0x1F;
+    vec5 = src1 & const_0x1F;
+    vec4 += src2 & const_0x1F;
+    vec5 += src3 & const_0x1F;
+    vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1);
+    vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6);
+    vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1);
+    vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6);
+    reg0 = vec3 * const_0x70;
+    reg1 = vec1 * const_0x4A;
+    reg2 = vec4 * const_0x70;
+    reg3 = vec1 * const_0x5E;
+    reg0 += const_0x8080;
+    reg1 += vec4 * const_0x26;
+    reg2 += const_0x8080;
+    reg3 += vec3 * const_0x12;
+    reg0 -= reg1;
+    reg2 -= reg3;
+    reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8);
+    reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    s += 16;
+    t += 16;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
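+// Computes a row of U/V from two rows of 24-bit pixels: the 3-byte pixels
+// are realigned to 4-byte lanes with sldi/vshf, 2x2 averaged, and run
+// through the same 112/74/38 (U) and 112/94/18 (V) weightings as the
+// 16-bit paths above. RAWToUVRow_MSA below is identical except that the
+// first and third channels are picked with swapped pckev/pckod selects.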
+void RGB24ToUVRow_MSA(const uint8_t* src_rgb0,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  int64_t res0, res1;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 reg0, reg1, reg2, reg3;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+  v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0);
+    inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16);
+    inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32);
+    inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0);
+    inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16);
+    inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32);
+    src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
+    src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
+    src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
+    src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
+    src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
+    src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
+    src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
+    src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
+    src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
+    src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
+    src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
+    src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
+    src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
+    src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
+    vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
+    vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
+    vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+    vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+    vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+    reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
+    reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
+    reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+    reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
+    reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
+    reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
+    reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
+    reg0 = __msa_srai_h((v8i16)reg0, 2);
+    reg1 = __msa_srai_h((v8i16)reg1, 2);
+    reg2 = __msa_srai_h((v8i16)reg2, 2);
+    reg3 = __msa_srai_h((v8i16)reg3, 2);
+    vec4 = (v8u16)__msa_pckev_h(reg1, reg0);
+    vec5 = (v8u16)__msa_pckev_h(reg3, reg2);
+    vec6 = (v8u16)__msa_pckod_h(reg1, reg0);
+    vec7 = (v8u16)__msa_pckod_h(reg3, reg2);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+    vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
+    vec3 = vec0 * const_0x70;
+    vec4 = vec1 * const_0x4A;
+    vec5 = vec2 * const_0x26;
+    vec2 *= const_0x70;
+    vec1 *= const_0x5E;
+    vec0 *= const_0x12;
+    reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
+    reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
+    reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
+    reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
+    reg0 += reg1;
+    reg2 += reg3;
+    reg0 = __msa_srai_h(reg0, 8);
+    reg2 = __msa_srai_h(reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    t += 48;
+    s += 48;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
+void RAWToUVRow_MSA(const uint8_t* src_rgb0,
+                    int src_stride_rgb,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  int64_t res0, res1;
+  v16u8 inp0, inp1, inp2, inp3, inp4, inp5;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8i16 reg0, reg1, reg2, reg3;
+  v16u8 dst0;
+  v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70);
+  v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A);
+  v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26);
+  v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E);
+  v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+  v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0);
+    inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16);
+    inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32);
+    inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0);
+    inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16);
+    inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32);
+    src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12);
+    src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12);
+    src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8);
+    src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8);
+    src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4);
+    src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4);
+    src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0);
+    src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1);
+    src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2);
+    src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3);
+    src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3);
+    src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5);
+    src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6);
+    src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2);
+    vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3);
+    vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3);
+    vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+    vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+    vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+    reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0);
+    reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2);
+    reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4);
+    reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6);
+    reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0);
+    reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2);
+    reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4);
+    reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6);
+    reg0 = __msa_srai_h(reg0, 2);
+    reg1 = __msa_srai_h(reg1, 2);
+    reg2 = __msa_srai_h(reg2, 2);
+    reg3 = __msa_srai_h(reg3, 2);
+    vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0);
+    vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2);
+    vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+    vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+    vec3 = vec0 * const_0x70;
+    vec4 = vec1 * const_0x4A;
+    vec5 = vec2 * const_0x26;
+    vec2 *= const_0x70;
+    vec1 *= const_0x5E;
+    vec0 *= const_0x12;
+    reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4);
+    reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5);
+    reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1);
+    reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0);
+    reg0 += reg1;
+    reg2 += reg3;
+    reg0 = __msa_srai_h(reg0, 8);
+    reg2 = __msa_srai_h(reg2, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0);
+    res0 = __msa_copy_u_d((v2i64)dst0, 0);
+    res1 = __msa_copy_u_d((v2i64)dst0, 1);
+    SD(res0, dst_u);
+    SD(res1, dst_v);
+    t += 48;
+    s += 48;
+    dst_u += 8;
+    dst_v += 8;
+  }
+}
+
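+// Converts 8 NV12 pixels per iteration: 8 Y bytes and 8 interleaved UV
+// bytes are loaded into the low half of a vector, run through YUVTORGB,
+// and interleaved with the constant alpha byte to form ARGB.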
+void NV12ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_uv,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64_t val0, val1;
+  v16u8 src0, src1, res0, res1, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 zero = {0};
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_y += 8;
+    src_uv += 8;
+    dst_argb += 32;
+  }
+}
+
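+// Same as NV12ToARGBRow_MSA, but packs the result as RGB565:
+// (B >> 3) | ((G >> 2) << 5) | ((R >> 3) << 11).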
+void NV12ToRGB565Row_MSA(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb565,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  int x;
+  uint64_t val0, val1;
+  v16u8 src0, src1, dst0;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 zero = {0};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_uv);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    vec0 = vec0 >> 3;
+    vec1 = (vec1 >> 2) << 5;
+    vec2 = (vec2 >> 3) << 11;
+    dst0 = (v16u8)(vec0 | vec1 | vec2);
+    ST_UB(dst0, dst_rgb565);
+    src_y += 8;
+    src_uv += 8;
+    dst_rgb565 += 16;
+  }
+}
+
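+// NV21 variant of the above: identical except the VU byte pairs are
+// swapped to UV order with `shuffler` before YUVTORGB.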
+void NV21ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_vu,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  uint64_t val0, val1;
+  v16u8 src0, src1, res0, res1, dst0, dst1;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v16u8 zero = {0};
+  v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    val0 = LD(src_y);
+    val1 = LD(src_vu);
+    src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0);
+    src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1);
+    src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+    YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_y += 8;
+    src_vu += 8;
+    dst_argb += 32;
+  }
+}
+
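+// Combines the Sobel X and Y planes with a saturating add and expands the
+// result to ARGB: each sum is replicated into the B, G and R bytes via the
+// mask0..mask3 shuffles, with the alpha byte supplied by `alpha`.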
+void SobelRow_MSA(const uint8_t* src_sobelx,
+                  const uint8_t* src_sobely,
+                  uint8_t* dst_argb,
+                  int width) {
+  int x;
+  v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3;
+  v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16};
+  v16i8 const_0x4 = __msa_ldi_b(0x4);
+  v16i8 mask1 = mask0 + const_0x4;
+  v16i8 mask2 = mask1 + const_0x4;
+  v16i8 mask3 = mask2 + const_0x4;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0);
+    dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0);
+    dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0);
+    dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
+void SobelToPlaneRow_MSA(const uint8_t* src_sobelx,
+                         const uint8_t* src_sobely,
+                         uint8_t* dst_y,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 16);
+    dst0 = __msa_adds_u_b(src0, src2);
+    dst1 = __msa_adds_u_b(src1, src3);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_sobelx += 32;
+    src_sobely += 32;
+    dst_y += 32;
+  }
+}
+
+void SobelXYRow_MSA(const uint8_t* src_sobelx,
+                    const uint8_t* src_sobely,
+                    uint8_t* dst_argb,
+                    int width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, vec2;
+  v16u8 reg0, reg1, dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0);
+    vec0 = __msa_adds_u_b(src0, src1);
+    vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1);
+    vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1);
+    reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0);
+    reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_sobelx += 16;
+    src_sobely += 16;
+    dst_argb += 64;
+  }
+}
+
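+// Full-range (JPEG) luma: Y = (15*B + 75*G + 38*R + 64) >> 7, with the
+// per-channel coefficients packed into the const_* operands of ARGBTOY.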
+void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F);
+  v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26);
+  v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
+void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200);
+  v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
+void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142);
+  v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
+void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0;
+  v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900);
+  v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281);
+  v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48);
+    ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8,
+            dst0);
+    ST_UB(dst0, dst_y);
+    src_argb0 += 64;
+    dst_y += 16;
+  }
+}
+
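+// Full-range (JPEG) chroma: the two input rows are averaged, pckev/pckod
+// provide the 2x1 horizontal average, and ARGBTOUV applies the full-range
+// U/V weightings packed into the 0x6B14 / 0x7F / 0x2B54 constants.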
+void ARGBToUVJRow_MSA(const uint8_t* src_rgb0,
+                      int src_stride_rgb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, vec2, vec3;
+  v16u8 dst0, dst1;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
+  v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F);
+  v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14);
+  v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((const v16i8*)t, 0);
+    src5 = (v16u8)__msa_ld_b((const v16i8*)t, 16);
+    src6 = (v16u8)__msa_ld_b((const v16i8*)t, 32);
+    src7 = (v16u8)__msa_ld_b((const v16i8*)t, 48);
+    src0 = __msa_aver_u_b(src0, src4);
+    src1 = __msa_aver_u_b(src1, src5);
+    src2 = __msa_aver_u_b(src2, src6);
+    src3 = __msa_aver_u_b(src3, src7);
+    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
+    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
+    vec0 = __msa_aver_u_b(src4, src6);
+    vec1 = __msa_aver_u_b(src5, src7);
+    src0 = (v16u8)__msa_ld_b((const v16i8*)s, 64);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)s, 80);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)s, 96);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)s, 112);
+    src4 = (v16u8)__msa_ld_b((const v16i8*)t, 64);
+    src5 = (v16u8)__msa_ld_b((const v16i8*)t, 80);
+    src6 = (v16u8)__msa_ld_b((const v16i8*)t, 96);
+    src7 = (v16u8)__msa_ld_b((const v16i8*)t, 112);
+    src0 = __msa_aver_u_b(src0, src4);
+    src1 = __msa_aver_u_b(src1, src5);
+    src2 = __msa_aver_u_b(src2, src6);
+    src3 = __msa_aver_u_b(src3, src7);
+    src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+    src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2);
+    src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2);
+    vec2 = __msa_aver_u_b(src4, src6);
+    vec3 = __msa_aver_u_b(src5, src7);
+    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54,
+             const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_v);
+    ST_UB(dst1, dst_u);
+    s += 128;
+    t += 128;
+    dst_v += 16;
+    dst_u += 16;
+  }
+}
+
+void BGRAToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
+  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+  v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
+  v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    READ_ARGB(s, t, vec0, vec1, vec2, vec3);
+    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A,
+             const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_v);
+    ST_UB(dst1, dst_u);
+    s += 128;
+    t += 128;
+    dst_v += 16;
+    dst_u += 16;
+  }
+}
+
+void ABGRToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30};
+  v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26);
+  v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070);
+  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    READ_ARGB(s, t, src0, src1, src2, src3);
+    ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E,
+             const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    s += 128;
+    t += 128;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void RGBAToUVRow_MSA(const uint8_t* src_rgb0,
+                     int src_stride_rgb,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  int x;
+  const uint8_t* s = src_rgb0;
+  const uint8_t* t = src_rgb0 + src_stride_rgb;
+  v16u8 dst0, dst1, vec0, vec1, vec2, vec3;
+  v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+  v16i8 shuffler1 = {2,  3,  6,  7,  10, 11, 14, 15,
+                     18, 19, 22, 23, 26, 27, 30, 31};
+  v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31};
+  v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29};
+  v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A);
+  v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000);
+  v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E);
+  v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080);
+
+  for (x = 0; x < width; x += 32) {
+    READ_ARGB(s, t, vec0, vec1, vec2, vec3);
+    ARGBTOUV(vec0, vec1, vec2, vec3, const_0x264A, const_0x7000, const_0x125E,
+             const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0,
+             dst1);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    s += 128;
+    t += 128;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
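+// I444 has one U/V sample per pixel, so the 2x-subsampled YUVTORGB macro
+// does not apply; the conversion is done inline: scale Y by vec_yg, add
+// the bb/bg/br bias terms, subtract the U/V contributions, shift by 6 and
+// clip to [0, 255] before packing with alpha.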
+void I444ToARGBRow_MSA(const uint8_t* src_y,
+                       const uint8_t* src_u,
+                       const uint8_t* src_v,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2, dst0, dst1;
+  v8u16 vec0, vec1, vec2;
+  v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v8i16 zero = {0};
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+
+  for (x = 0; x < width; x += 8) {
+    READI444(src_y, src_u, src_v, src0, src1, src2);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+    reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+    reg0 *= vec_yg;
+    reg1 *= vec_yg;
+    reg0 = __msa_srai_w(reg0, 16);
+    reg1 = __msa_srai_w(reg1, 16);
+    reg4 = reg0 + vec_br;
+    reg5 = reg1 + vec_br;
+    reg2 = reg0 + vec_bg;
+    reg3 = reg1 + vec_bg;
+    reg0 += vec_bb;
+    reg1 += vec_bb;
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2);
+    reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+    reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+    reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+    reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+    reg0 -= reg6 * vec_ub;
+    reg1 -= reg7 * vec_ub;
+    reg2 -= reg6 * vec_ug;
+    reg3 -= reg7 * vec_ug;
+    reg4 -= reg8 * vec_vr;
+    reg5 -= reg9 * vec_vr;
+    reg2 -= reg8 * vec_vg;
+    reg3 -= reg9 * vec_vg;
+    reg0 = __msa_srai_w(reg0, 6);
+    reg1 = __msa_srai_w(reg1, 6);
+    reg2 = __msa_srai_w(reg2, 6);
+    reg3 = __msa_srai_w(reg3, 6);
+    reg4 = __msa_srai_w(reg4, 6);
+    reg5 = __msa_srai_w(reg5, 6);
+    CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4);
+    vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0);
+    vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2);
+    dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0);
+    dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_y += 8;
+    src_u += 8;
+    src_v += 8;
+    dst_argb += 32;
+  }
+}
+
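+// Expands a video-range Y plane to ARGB: Y is scaled by vec_yg, offset by
+// vec_ygb (a negative bias packed as 0xFB78), shifted by 6, clamped to
+// [0, 255] and replicated into the B, G and R bytes.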
+void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  int x;
+  v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3;
+  v8i16 vec0, vec1;
+  v4i32 reg0, reg1, reg2, reg3;
+  v4i32 vec_yg = __msa_fill_w(0x4A35);
+  v8i16 vec_ygb = __msa_fill_h(0xFB78);
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+  v8i16 max = __msa_ldi_h(0xFF);
+  v8i16 zero = {0};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0);
+    vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    reg0 = (v4i32)__msa_ilvr_h(zero, vec0);
+    reg1 = (v4i32)__msa_ilvl_h(zero, vec0);
+    reg2 = (v4i32)__msa_ilvr_h(zero, vec1);
+    reg3 = (v4i32)__msa_ilvl_h(zero, vec1);
+    reg0 *= vec_yg;
+    reg1 *= vec_yg;
+    reg2 *= vec_yg;
+    reg3 *= vec_yg;
+    reg0 = __msa_srai_w(reg0, 16);
+    reg1 = __msa_srai_w(reg1, 16);
+    reg2 = __msa_srai_w(reg2, 16);
+    reg3 = __msa_srai_w(reg3, 16);
+    vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    vec0 += vec_ygb;
+    vec1 += vec_ygb;
+    vec0 = __msa_srai_h(vec0, 6);
+    vec1 = __msa_srai_h(vec1, 6);
+    vec0 = __msa_maxi_s_h(vec0, 0);
+    vec1 = __msa_maxi_s_h(vec1, 0);
+    vec0 = __msa_min_s_h(max, vec0);
+    vec1 = __msa_min_s_h(max, vec1);
+    res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0);
+    res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0);
+    res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0);
+    res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_y += 16;
+    dst_argb += 64;
+  }
+}
+
+void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  int x;
+  v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0);
+    vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0);
+    vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0);
+    vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0);
+    vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0);
+    dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1);
+    dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    src_y += 16;
+    dst_argb += 64;
+  }
+}
+
+void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_yuy2, 0);
+    src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
+    src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
+    YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
+    src_yuy2 += 16;
+    dst_argb += 32;
+  }
+}
+
+void UYVYToARGBRow_MSA(const uint8_t* src_uyvy,
+                       uint8_t* dst_argb,
+                       const struct YuvConstants* yuvconstants,
+                       int width) {
+  int x;
+  v16u8 src0, src1, src2;
+  v8i16 vec0, vec1, vec2;
+  v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg;
+  v4i32 vec_ubvr, vec_ugvg;
+  v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL);
+
+  YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg,
+                 vec_br, vec_yg);
+  vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub);
+  vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug);
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_uyvy, 0);
+    src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0);
+    src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0);
+    YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg,
+             vec0, vec1, vec2);
+    STOREARGB(vec0, vec1, vec2, alpha, dst_argb);
+    src_uyvy += 16;
+    dst_argb += 32;
+  }
+}
+
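+// Bilinear blend of two rows: dst = (src * (256 - f) + src_next * f) >> 8
+// with rounding, computed as a u8 dot product against the packed fraction
+// pair in y_frac. f == 0 (plain copy) and f == 128 (average) take
+// dedicated fast paths.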
+void InterpolateRow_MSA(uint8_t* dst_ptr,
+                        const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        int width,
+                        int32_t source_y_fraction) {
+  int32_t y1_fraction = source_y_fraction;
+  int32_t y0_fraction = 256 - y1_fraction;
+  uint16_t y_fractions;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, y_frac;
+
+  if (0 == y1_fraction) {
+    memcpy(dst_ptr, src_ptr, width);
+    return;
+  }
+
+  if (128 == y1_fraction) {
+    for (x = 0; x < width; x += 32) {
+      src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0);
+      src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16);
+      src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0);
+      src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16);
+      dst0 = __msa_aver_u_b(src0, src2);
+      dst1 = __msa_aver_u_b(src1, src3);
+      ST_UB2(dst0, dst1, dst_ptr, 16);
+      s += 32;
+      t += 32;
+      dst_ptr += 32;
+    }
+    return;
+  }
+
+  y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8));
+  y_frac = (v8u16)__msa_fill_h(y_fractions);
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac);
+    vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac);
+    vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac);
+    vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac);
+    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8);
+    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8);
+    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8);
+    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    ST_UB2(dst0, dst1, dst_ptr, 16);
+    s += 32;
+    t += 32;
+    dst_ptr += 32;
+  }
+}
+
+void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) {
+  int x;
+  v4i32 dst0 = __msa_fill_w((int32_t)v32);
+
+  for (x = 0; x < width; x += 4) {
+    ST_UB(dst0, dst_argb);
+    dst_argb += 16;
+  }
+}
+
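+// Converts RAW (3 bytes per pixel) to RGB24 by swapping the first and
+// third byte of every pixel; the sldi shifts realign the 48 input bytes so
+// each shuffle operates on whole pixels.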
+void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2;
+  v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17};
+  v16i8 shuffler1 = {8,  7,  12, 11, 10, 15, 14, 13,
+                     18, 17, 16, 21, 20, 19, 24, 23};
+  v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25,
+                     24, 23, 28, 27, 26, 31, 30, 29};
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32);
+    src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8);
+    src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8);
+    dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3);
+    dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1);
+    ST_UB2(dst0, dst1, dst_rgb24, 16);
+    ST_UB(dst2, (dst_rgb24 + 32));
+    src_raw += 48;
+    dst_rgb24 += 48;
+  }
+}
+
+void MergeUVRow_MSA(const uint8_t* src_u,
+                    const uint8_t* src_v,
+                    uint8_t* dst_uv,
+                    int width) {
+  int x;
+  v16u8 src0, src1, dst0, dst1;
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_u, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_v, 0);
+    dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0);
+    ST_UB2(dst0, dst1, dst_uv, 16);
+    src_u += 16;
+    src_v += 16;
+    dst_uv += 32;
+  }
+}
+
+void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width) {
+  int i;
+  v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
+
+  for (i = 0; i < width; i += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48);
+    vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_a);
+    src_argb += 64;
+    dst_a += 16;
+  }
+}
+
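+// Alpha blend of two ARGB rows: per channel,
+// dst = src0 + ((256 - a0) * src1 >> 8), where a0 is the source alpha
+// broadcast by the fill_h/pckev_d sequence; the destination alpha is then
+// forced to 255 through the bmnz mask.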
+void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+                      const uint8_t* src_argb1,
+                      uint8_t* dst_argb,
+                      int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 vec8, vec9, vec10, vec11, vec12, vec13;
+  v8u16 const_256 = (v8u16)__msa_ldi_h(256);
+  v16u8 const_255 = (v16u8)__msa_ldi_b(255);
+  v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16);
+    vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2);
+    vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2);
+    vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3);
+    vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3);
+    vec8 = (v8u16)__msa_fill_h(vec0[3]);
+    vec9 = (v8u16)__msa_fill_h(vec0[7]);
+    vec10 = (v8u16)__msa_fill_h(vec1[3]);
+    vec11 = (v8u16)__msa_fill_h(vec1[7]);
+    vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8);
+    vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
+    vec10 = (v8u16)__msa_fill_h(vec2[3]);
+    vec11 = (v8u16)__msa_fill_h(vec2[7]);
+    vec12 = (v8u16)__msa_fill_h(vec3[3]);
+    vec13 = (v8u16)__msa_fill_h(vec3[7]);
+    vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10);
+    vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12);
+    vec8 = const_256 - vec8;
+    vec9 = const_256 - vec9;
+    vec10 = const_256 - vec10;
+    vec11 = const_256 - vec11;
+    vec8 *= vec4;
+    vec9 *= vec5;
+    vec10 *= vec6;
+    vec11 *= vec7;
+    vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8);
+    vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8);
+    vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8);
+    vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8);
+    vec0 += vec8;
+    vec1 += vec9;
+    vec2 += vec10;
+    vec3 += vec11;
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    dst0 = __msa_bmnz_v(dst0, const_255, mask);
+    dst1 = __msa_bmnz_v(dst1, const_255, mask);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb0 += 32;
+    src_argb1 += 32;
+    dst_argb += 32;
+  }
+}
+
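+// In-place posterize: each B/G/R byte becomes
+// ((v * scale) >> 16) * interval_size + interval_offset; the final vshf
+// with `mask` copies the original alpha bytes back into the result.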
+void ARGBQuantizeRow_MSA(uint8_t* dst_argb,
+                         int scale,
+                         int interval_size,
+                         int interval_offset,
+                         int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  v4i32 vec_scale = __msa_fill_w(scale);
+  v16u8 vec_int_sz = (v16u8)__msa_fill_b(interval_size);
+  v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset);
+  v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31};
+  v16i8 zero = {0};
+
+  for (x = 0; x < width; x += 8) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 48);
+    vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0);
+    vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0);
+    vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
+    vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
+    vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
+    vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
+    vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3);
+    vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3);
+    tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0);
+    tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0);
+    tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1);
+    tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1);
+    tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2);
+    tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2);
+    tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3);
+    tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3);
+    tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4);
+    tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4);
+    tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5);
+    tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5);
+    tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6);
+    tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6);
+    tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7);
+    tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7);
+    tmp0 *= vec_scale;
+    tmp1 *= vec_scale;
+    tmp2 *= vec_scale;
+    tmp3 *= vec_scale;
+    tmp4 *= vec_scale;
+    tmp5 *= vec_scale;
+    tmp6 *= vec_scale;
+    tmp7 *= vec_scale;
+    tmp8 *= vec_scale;
+    tmp9 *= vec_scale;
+    tmp10 *= vec_scale;
+    tmp11 *= vec_scale;
+    tmp12 *= vec_scale;
+    tmp13 *= vec_scale;
+    tmp14 *= vec_scale;
+    tmp15 *= vec_scale;
+    tmp0 >>= 16;
+    tmp1 >>= 16;
+    tmp2 >>= 16;
+    tmp3 >>= 16;
+    tmp4 >>= 16;
+    tmp5 >>= 16;
+    tmp6 >>= 16;
+    tmp7 >>= 16;
+    tmp8 >>= 16;
+    tmp9 >>= 16;
+    tmp10 >>= 16;
+    tmp11 >>= 16;
+    tmp12 >>= 16;
+    tmp13 >>= 16;
+    tmp14 >>= 16;
+    tmp15 >>= 16;
+    vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
+    vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
+    vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
+    vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
+    vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
+    vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4);
+    dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6);
+    dst0 *= vec_int_sz;
+    dst1 *= vec_int_sz;
+    dst2 *= vec_int_sz;
+    dst3 *= vec_int_sz;
+    dst0 += vec_int_ofst;
+    dst1 += vec_int_ofst;
+    dst2 += vec_int_ofst;
+    dst3 += vec_int_ofst;
+    dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0);
+    dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1);
+    dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2);
+    dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3);
+    ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16);
+    dst_argb += 64;
+  }
+}
+
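+// Applies a 4x4 signed 8-bit color matrix (1.0 == 64, hence the >> 6) to
+// each ARGB pixel: channels are widened to 16 bits, multiplied against the
+// matrix rows, horizontally summed with hadd, shifted and clamped to
+// [0, 255].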
+void ARGBColorMatrixRow_MSA(const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            const int8_t* matrix_argb,
+                            int width) {
+  int32_t x;
+  v16i8 src0;
+  v16u8 src1, src2, dst0, dst1;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
+  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  v16i8 zero = {0};
+  v8i16 max = __msa_ldi_h(255);
+
+  src0 = __msa_ld_b((const v16i8*)matrix_argb, 0);
+  vec0 = (v8i16)__msa_ilvr_b(zero, src0);
+  vec1 = (v8i16)__msa_ilvl_b(zero, src0);
+
+  for (x = 0; x < width; x += 8) {
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16);
+    vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1);
+    vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1);
+    vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2);
+    vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2);
+    vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2);
+    vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3);
+    vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4);
+    vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5);
+    vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2);
+    vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3);
+    vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4);
+    vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5);
+    vec10 = vec2 * vec0;
+    vec11 = vec2 * vec1;
+    vec12 = vec6 * vec0;
+    vec13 = vec6 * vec1;
+    tmp0 = __msa_hadd_s_w(vec10, vec10);
+    tmp1 = __msa_hadd_s_w(vec11, vec11);
+    tmp2 = __msa_hadd_s_w(vec12, vec12);
+    tmp3 = __msa_hadd_s_w(vec13, vec13);
+    vec14 = vec3 * vec0;
+    vec15 = vec3 * vec1;
+    vec16 = vec7 * vec0;
+    vec17 = vec7 * vec1;
+    tmp4 = __msa_hadd_s_w(vec14, vec14);
+    tmp5 = __msa_hadd_s_w(vec15, vec15);
+    tmp6 = __msa_hadd_s_w(vec16, vec16);
+    tmp7 = __msa_hadd_s_w(vec17, vec17);
+    vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
+    vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
+    tmp0 = __msa_hadd_s_w(vec10, vec10);
+    tmp1 = __msa_hadd_s_w(vec11, vec11);
+    tmp2 = __msa_hadd_s_w(vec12, vec12);
+    tmp3 = __msa_hadd_s_w(vec13, vec13);
+    tmp0 = __msa_srai_w(tmp0, 6);
+    tmp1 = __msa_srai_w(tmp1, 6);
+    tmp2 = __msa_srai_w(tmp2, 6);
+    tmp3 = __msa_srai_w(tmp3, 6);
+    vec2 = vec4 * vec0;
+    vec6 = vec4 * vec1;
+    vec3 = vec8 * vec0;
+    vec7 = vec8 * vec1;
+    tmp8 = __msa_hadd_s_w(vec2, vec2);
+    tmp9 = __msa_hadd_s_w(vec6, vec6);
+    tmp10 = __msa_hadd_s_w(vec3, vec3);
+    tmp11 = __msa_hadd_s_w(vec7, vec7);
+    vec4 = vec5 * vec0;
+    vec8 = vec5 * vec1;
+    vec5 = vec9 * vec0;
+    vec9 = vec9 * vec1;
+    tmp12 = __msa_hadd_s_w(vec4, vec4);
+    tmp13 = __msa_hadd_s_w(vec8, vec8);
+    tmp14 = __msa_hadd_s_w(vec5, vec5);
+    tmp15 = __msa_hadd_s_w(vec9, vec9);
+    vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8);
+    vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10);
+    vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12);
+    vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14);
+    tmp4 = __msa_hadd_s_w(vec14, vec14);
+    tmp5 = __msa_hadd_s_w(vec15, vec15);
+    tmp6 = __msa_hadd_s_w(vec16, vec16);
+    tmp7 = __msa_hadd_s_w(vec17, vec17);
+    tmp4 = __msa_srai_w(tmp4, 6);
+    tmp5 = __msa_srai_w(tmp5, 6);
+    tmp6 = __msa_srai_w(tmp6, 6);
+    tmp7 = __msa_srai_w(tmp7, 6);
+    vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4);
+    vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6);
+    vec10 = __msa_maxi_s_h(vec10, 0);
+    vec11 = __msa_maxi_s_h(vec11, 0);
+    vec12 = __msa_maxi_s_h(vec12, 0);
+    vec13 = __msa_maxi_s_h(vec13, 0);
+    vec10 = __msa_min_s_h(vec10, max);
+    vec11 = __msa_min_s_h(vec11, max);
+    vec12 = __msa_min_s_h(vec12, max);
+    vec13 = __msa_min_s_h(vec13, max);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12);
+    ST_UB2(dst0, dst1, dst_argb, 16);
+    src_argb += 32;
+    dst_argb += 32;
+  }
+}
+
+void SplitUVRow_MSA(const uint8_t* src_uv,
+                    uint8_t* dst_u,
+                    uint8_t* dst_v,
+                    int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+
+  for (x = 0; x < width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_u, 16);
+    ST_UB2(dst2, dst3, dst_v, 16);
+    src_uv += 64;
+    dst_u += 32;
+    dst_v += 32;
+  }
+}
+
+void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) {
+  int x;
+  v16u8 dst0 = (v16u8)__msa_fill_b(v8);
+
+  for (x = 0; x < width; x += 16) {
+    ST_UB(dst0, dst);
+    dst += 16;
+  }
+}
+
+void MirrorUVRow_MSA(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  int x;
+  v16u8 src0, src1, src2, src3;
+  v16u8 dst0, dst1, dst2, dst3;
+  v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0};
+  v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1};
+
+  src_uv += (2 * width);
+
+  for (x = 0; x < width; x += 32) {
+    src_uv -= 64;
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16);
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48);
+    dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+    dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
+    dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst_v, 16);
+    ST_UB2(dst2, dst3, dst_u, 16);
+    dst_u += 32;
+    dst_v += 32;
+  }
+}
+
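+// Sobel X: vshf/hsub form the differences of samples two columns apart in
+// each of the three input rows, the rows are combined with 1/2/1 weights,
+// and the absolute value is clamped to [0, 255].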
+void SobelXRow_MSA(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   const uint8_t* src_y2,
+                   uint8_t* dst_sobelx,
+                   int32_t width) {
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, dst0;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9};
+  v16i8 tmp = __msa_ldi_b(8);
+  v16i8 mask1 = mask0 + tmp;
+  v8i16 zero = {0};
+  v8i16 max = __msa_ldi_h(255);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 16);
+    src2 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0);
+    src3 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 16);
+    src4 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 0);
+    src5 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 16);
+    vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
+    vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2);
+    vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+    vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4);
+    vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
+    vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4);
+    vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5);
+    vec0 += vec2;
+    vec1 += vec3;
+    vec4 += vec2;
+    vec5 += vec3;
+    vec0 += vec4;
+    vec1 += vec5;
+    vec0 = __msa_add_a_h(zero, vec0);
+    vec1 = __msa_add_a_h(zero, vec1);
+    vec0 = __msa_maxi_s_h(vec0, 0);
+    vec1 = __msa_maxi_s_h(vec1, 0);
+    vec0 = __msa_min_s_h(max, vec0);
+    vec1 = __msa_min_s_h(max, vec1);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_sobelx);
+    src_y0 += 16;
+    src_y1 += 16;
+    src_y2 += 16;
+    dst_sobelx += 16;
+  }
+}
+
+void SobelYRow_MSA(const uint8_t* src_y0,
+                   const uint8_t* src_y1,
+                   uint8_t* dst_sobely,
+                   int32_t width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6;
+  v8i16 zero = {0};
+  v8i16 max = __msa_ldi_h(255);
+
+  for (x = 0; x < width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0);
+    src1 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0);
+    vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0);
+    vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0);
+    vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1);
+    vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1);
+    vec0 -= vec2;
+    vec1 -= vec3;
+    vec6[0] = src_y0[16] - src_y1[16];
+    vec6[1] = src_y0[17] - src_y1[17];
+    vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2);
+    vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2);
+    vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4);
+    vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4);
+    vec0 += vec2;
+    vec1 += vec3;
+    vec4 += vec2;
+    vec5 += vec3;
+    vec0 += vec4;
+    vec1 += vec5;
+    vec0 = __msa_add_a_h(zero, vec0);
+    vec1 = __msa_add_a_h(zero, vec1);
+    vec0 = __msa_maxi_s_h(vec0, 0);
+    vec1 = __msa_maxi_s_h(vec1, 0);
+    vec0 = __msa_min_s_h(max, vec0);
+    vec1 = __msa_min_s_h(max, vec1);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst_sobely);
+    src_y0 += 16;
+    src_y1 += 16;
+    dst_sobely += 16;
+  }
+}
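
Both Sobel rows compute the usual 1-2-1 weighted differences and clamp the absolute value to 8 bits; the addition chains (vec0 += vec2, vec4 += vec2, vec0 += vec4) build the weight-2 middle tap without a multiply. A scalar sketch of the per-pixel computation, following the shape of libyuv's C row functions:

    #include <stdint.h>
    #include <stdlib.h>  /* abs */

    static uint8_t Clamp255(int v) { return (uint8_t)(v > 255 ? 255 : v); }

    /* Illustration only: horizontal gradient from three source rows. */
    void SobelXRow_C_sketch(const uint8_t* y0, const uint8_t* y1,
                            const uint8_t* y2, uint8_t* dst, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        int a = y0[i] - y0[i + 2];  /* top row difference */
        int b = y1[i] - y1[i + 2];  /* middle row, weight 2 */
        int c = y2[i] - y2[i + 2];  /* bottom row difference */
        dst[i] = Clamp255(abs(a + 2 * b + c));
      }
    }

    /* Illustration only: vertical gradient from two source rows. */
    void SobelYRow_C_sketch(const uint8_t* y0, const uint8_t* y1,
                            uint8_t* dst, int width) {
      int i;
      for (i = 0; i < width; ++i) {
        int a = y0[i + 0] - y1[i + 0];
        int b = y0[i + 1] - y1[i + 1];  /* middle column, weight 2 */
        int c = y0[i + 2] - y1[i + 2];
        dst[i] = Clamp255(abs(a + 2 * b + c));
      }
    }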
+
+void HalfFloatRow_MSA(const uint16_t* src,
+                      uint16_t* dst,
+                      float scale,
+                      int width) {
+  int i;
+  v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+  v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7;
+  v4f32 mult_vec;
+  v8i16 zero = {0};
+  mult_vec[0] = 1.9259299444e-34f * scale;
+  mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0);
+
+  for (i = 0; i < width; i += 32) {
+    src0 = (v8u16)__msa_ld_h((v8i16*)src, 0);
+    src1 = (v8u16)__msa_ld_h((v8i16*)src, 16);
+    src2 = (v8u16)__msa_ld_h((v8i16*)src, 32);
+    src3 = (v8u16)__msa_ld_h((v8i16*)src, 48);
+    vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0);
+    vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0);
+    vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1);
+    vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1);
+    vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2);
+    vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2);
+    vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3);
+    vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3);
+    fvec0 = __msa_ffint_u_w(vec0);
+    fvec1 = __msa_ffint_u_w(vec1);
+    fvec2 = __msa_ffint_u_w(vec2);
+    fvec3 = __msa_ffint_u_w(vec3);
+    fvec4 = __msa_ffint_u_w(vec4);
+    fvec5 = __msa_ffint_u_w(vec5);
+    fvec6 = __msa_ffint_u_w(vec6);
+    fvec7 = __msa_ffint_u_w(vec7);
+    fvec0 *= mult_vec;
+    fvec1 *= mult_vec;
+    fvec2 *= mult_vec;
+    fvec3 *= mult_vec;
+    fvec4 *= mult_vec;
+    fvec5 *= mult_vec;
+    fvec6 *= mult_vec;
+    fvec7 *= mult_vec;
+    vec0 = ((v4u32)fvec0) >> 13;
+    vec1 = ((v4u32)fvec1) >> 13;
+    vec2 = ((v4u32)fvec2) >> 13;
+    vec3 = ((v4u32)fvec3) >> 13;
+    vec4 = ((v4u32)fvec4) >> 13;
+    vec5 = ((v4u32)fvec5) >> 13;
+    vec6 = ((v4u32)fvec6) >> 13;
+    vec7 = ((v4u32)fvec7) >> 13;
+    dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0);
+    dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2);
+    dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4);
+    dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6);
+    ST_UH2(dst0, dst1, dst, 8);
+    ST_UH2(dst2, dst3, dst + 16, 8);
+    src += 32;
+    dst += 32;
+  }
+}
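
The constant 1.9259299444e-34f is 2^-112. Multiplying by scale * 2^-112 rebiases the single-precision exponent (bias 127) down to the half-precision bias (15), so shifting the float's bit pattern right by 13 drops the low mantissa bits and leaves the exponent and top 10 mantissa bits exactly where IEEE half expects them. A scalar sketch of the trick, valid while the scaled value stays within the half-precision normal range:

    #include <stdint.h>
    #include <string.h>

    /* Illustration only: float-to-half via exponent rebias plus shift. */
    static uint16_t FloatToHalf_sketch(uint16_t src, float scale) {
      float f = (float)src * (scale * 1.9259299444e-34f);  /* scale * 2^-112 */
      uint32_t bits;
      memcpy(&bits, &f, sizeof(bits));  /* type-pun without aliasing UB */
      return (uint16_t)(bits >> 13);    /* exponent + top 10 mantissa bits */
    }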
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
--- a/third_party/libyuv/source/row_neon.cc
+++ b/third_party/libyuv/source/row_neon.cc
@@ -10,6 +10,8 @@
 
 #include "libyuv/row.h"
 
+#include <stdio.h>
+
 #ifdef __cplusplus
 namespace libyuv {
 extern "C" {
@@ -20,1446 +22,1311 @@
     !defined(__aarch64__)
 
 // Read 8 Y, 4 U and 4 V from 422
-#define READYUV422                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.32    {d2[1]}, [%2]!                 \n"
+#define READYUV422                               \
+  "vld1.8     {d0}, [%0]!                    \n" \
+  "vld1.32    {d2[0]}, [%1]!                 \n" \
+  "vld1.32    {d2[1]}, [%2]!                 \n"
 
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vzip.u8    d2, d3                         \n"
-
 // Read 8 Y, 8 U and 8 V from 444
-#define READYUV444                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    MEMACCESS(2)                                                               \
-    "vld1.8     {d3}, [%2]!                    \n"                             \
-    "vpaddl.u8  q1, q1                         \n"                             \
-    "vrshrn.u16 d2, q1, #1                     \n"
+#define READYUV444                               \
+  "vld1.8     {d0}, [%0]!                    \n" \
+  "vld1.8     {d2}, [%1]!                    \n" \
+  "vld1.8     {d3}, [%2]!                    \n" \
+  "vpaddl.u8  q1, q1                         \n" \
+  "vrshrn.u16 d2, q1, #1                     \n"
 
 // Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400                                                             \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    "vmov.u8    d2, #128                       \n"
+#define READYUV400                               \
+  "vld1.8     {d0}, [%0]!                    \n" \
+  "vmov.u8    d2, #128                       \n"
 
 // Read 8 Y and 4 UV from NV12
 #define READNV12                                                               \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+  "vld1.8     {d0}, [%0]!                    \n"                               \
+  "vld1.8     {d2}, [%1]!                    \n"                               \
+  "vmov.u8    d3, d2                         \n" /* split odd/even uv apart */ \
+  "vuzp.u8    d2, d3                         \n"                               \
+  "vtrn.u32   d2, d3                         \n"
 
 // Read 8 Y and 4 VU from NV21
 #define READNV21                                                               \
-    MEMACCESS(0)                                                               \
-    "vld1.8     {d0}, [%0]!                    \n"                             \
-    MEMACCESS(1)                                                               \
-    "vld1.8     {d2}, [%1]!                    \n"                             \
-    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
-    "vuzp.u8    d3, d2                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+  "vld1.8     {d0}, [%0]!                    \n"                               \
+  "vld1.8     {d2}, [%1]!                    \n"                               \
+  "vmov.u8    d3, d2                         \n" /* split odd/even uv apart */ \
+  "vuzp.u8    d3, d2                         \n"                               \
+  "vtrn.u32   d2, d3                         \n"
 
 // Read 8 YUY2
-#define READYUY2                                                               \
-    MEMACCESS(0)                                                               \
-    "vld2.8     {d0, d2}, [%0]!                \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+#define READYUY2                                 \
+  "vld2.8     {d0, d2}, [%0]!                \n" \
+  "vmov.u8    d3, d2                         \n" \
+  "vuzp.u8    d2, d3                         \n" \
+  "vtrn.u32   d2, d3                         \n"
 
 // Read 8 UYVY
-#define READUYVY                                                               \
-    MEMACCESS(0)                                                               \
-    "vld2.8     {d2, d3}, [%0]!                \n"                             \
-    "vmov.u8    d0, d3                         \n"                             \
-    "vmov.u8    d3, d2                         \n"                             \
-    "vuzp.u8    d2, d3                         \n"                             \
-    "vtrn.u32   d2, d3                         \n"
+#define READUYVY                                 \
+  "vld2.8     {d2, d3}, [%0]!                \n" \
+  "vmov.u8    d0, d3                         \n" \
+  "vmov.u8    d3, d2                         \n" \
+  "vuzp.u8    d2, d3                         \n" \
+  "vtrn.u32   d2, d3                         \n"
 
-#define YUVTORGB_SETUP                                                         \
-    MEMACCESS([kUVToRB])                                                       \
-    "vld1.8     {d24}, [%[kUVToRB]]            \n"                             \
-    MEMACCESS([kUVToG])                                                        \
-    "vld1.8     {d25}, [%[kUVToG]]             \n"                             \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n"                           \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n"                           \
-    MEMACCESS([kUVBiasBGR])                                                    \
-    "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n"                           \
-    MEMACCESS([kYToRgb])                                                       \
-    "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
+#define YUVTORGB_SETUP                             \
+  "vld1.8     {d24}, [%[kUVToRB]]            \n"   \
+  "vld1.8     {d25}, [%[kUVToG]]             \n"   \
+  "vld1.16    {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \
+  "vld1.16    {d8[], d9[]}, [%[kUVBiasBGR]]!   \n" \
+  "vld1.16    {d28[], d29[]}, [%[kUVBiasBGR]]  \n" \
+  "vld1.32    {d30[], d31[]}, [%[kYToRgb]]     \n"
 
-#define YUVTORGB                                                               \
-    "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */\
-    "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */\
-    "vmovl.u8   q0, d0                         \n" /* Y                      */\
-    "vmovl.s16  q10, d1                        \n"                             \
-    "vmovl.s16  q0, d0                         \n"                             \
-    "vmul.s32   q10, q10, q15                  \n"                             \
-    "vmul.s32   q0, q0, q15                    \n"                             \
-    "vqshrun.s32 d0, q0, #16                   \n"                             \
-    "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */\
-    "vadd.s16   d18, d19                       \n"                             \
-    "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */\
-    "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */\
-    "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/\
-    "vaddw.u16  q1, q1, d16                    \n"                             \
-    "vaddw.u16  q10, q10, d17                  \n"                             \
-    "vaddw.u16  q3, q3, d18                    \n"                             \
-    "vqadd.s16  q8, q0, q13                    \n" /* B */                     \
-    "vqadd.s16  q9, q0, q14                    \n" /* R */                     \
-    "vqadd.s16  q0, q0, q4                     \n" /* G */                     \
-    "vqadd.s16  q8, q8, q1                     \n" /* B */                     \
-    "vqadd.s16  q9, q9, q10                    \n" /* R */                     \
-    "vqsub.s16  q0, q0, q3                     \n" /* G */                     \
-    "vqshrun.s16 d20, q8, #6                   \n" /* B */                     \
-    "vqshrun.s16 d22, q9, #6                   \n" /* R */                     \
-    "vqshrun.s16 d21, q0, #6                   \n" /* G */
+#define YUVTORGB                                                              \
+  "vmull.u8   q8, d2, d24                    \n" /* u/v B/R component      */ \
+  "vmull.u8   q9, d2, d25                    \n" /* u/v G component        */ \
+  "vmovl.u8   q0, d0                         \n" /* Y                      */ \
+  "vmovl.s16  q10, d1                        \n"                              \
+  "vmovl.s16  q0, d0                         \n"                              \
+  "vmul.s32   q10, q10, q15                  \n"                              \
+  "vmul.s32   q0, q0, q15                    \n"                              \
+  "vqshrun.s32 d0, q0, #16                   \n"                              \
+  "vqshrun.s32 d1, q10, #16                  \n" /* Y                      */ \
+  "vadd.s16   d18, d19                       \n"                              \
+  "vshll.u16  q1, d16, #16                   \n" /* Replicate u * UB       */ \
+  "vshll.u16  q10, d17, #16                  \n" /* Replicate v * VR       */ \
+  "vshll.u16  q3, d18, #16                   \n" /* Replicate (v*VG + u*UG)*/ \
+  "vaddw.u16  q1, q1, d16                    \n"                              \
+  "vaddw.u16  q10, q10, d17                  \n"                              \
+  "vaddw.u16  q3, q3, d18                    \n"                              \
+  "vqadd.s16  q8, q0, q13                    \n" /* B */                      \
+  "vqadd.s16  q9, q0, q14                    \n" /* R */                      \
+  "vqadd.s16  q0, q0, q4                     \n" /* G */                      \
+  "vqadd.s16  q8, q8, q1                     \n" /* B */                      \
+  "vqadd.s16  q9, q9, q10                    \n" /* R */                      \
+  "vqsub.s16  q0, q0, q3                     \n" /* G */                      \
+  "vqshrun.s16 d20, q8, #6                   \n" /* B */                      \
+  "vqshrun.s16 d22, q9, #6                   \n" /* R */                      \
+  "vqshrun.s16 d21, q0, #6                   \n" /* G */
 
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV444
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8    d23, #255                      \n"
+      "1:                                        \n" READYUV444 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_argb),  // %3
+        "+r"(width)      // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
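
Every row function in this file uses the same extended-asm shape: the pointer and width operands are read-write registers referenced as %0, %1, ..., the YuvConstants fields are bound as named inputs, and each NEON register the body writes is declared in the clobber list. A minimal, hypothetical example of the pattern (ARM32 NEON, GCC/Clang extended asm), not taken from the patch:

    #include <stdint.h>

    /* Illustration only: copy 8 bytes with the same operand conventions. */
    static void Copy8Bytes_sketch(const uint8_t* src, uint8_t* dst) {
      asm volatile(
          "vld1.8     {d0}, [%0]!                    \n"  // load 8 bytes
          "vst1.8     {d0}, [%1]!                    \n"  // store 8 bytes
          : "+r"(src),  // %0: read-write, advanced by the post-increment
            "+r"(dst)   // %1
          :             // no pure inputs
          : "cc", "memory", "d0");
    }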
 
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8    d23, #255                      \n"
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_argb),  // %3
+        "+r"(width)      // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void I422AlphaToARGBRow_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             const uint8* src_a,
-                             uint8* dst_argb,
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             const uint8_t* src_a,
+                             uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %5, %5, #8                     \n"
-    MEMACCESS(3)
-    "vld1.8     {d23}, [%3]!                   \n"
-    MEMACCESS(4)
-    "vst4.8     {d20, d21, d22, d23}, [%4]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(src_a),     // %3
-      "+r"(dst_argb),  // %4
-      "+r"(width)      // %5
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %5, %5, #8                     \n"
+      "vld1.8     {d23}, [%3]!                   \n"
+      "vst4.8     {d20, d21, d22, d23}, [%4]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(src_a),     // %3
+        "+r"(dst_argb),  // %4
+        "+r"(width)      // %5
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vmov.u8    d19, #255                      \n"  // YUVTORGB modified d19
+      "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_rgba),  // %3
+        "+r"(width)      // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d19, #255                      \n"  // d19 modified by YUVTORGB
-    MEMACCESS(3)
-    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_rgba),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    MEMACCESS(3)
-    "vst3.8     {d20, d21, d22}, [%3]!         \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),      // %0
-      "+r"(src_u),      // %1
-      "+r"(src_v),      // %2
-      "+r"(dst_rgb24),  // %3
-      "+r"(width)       // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vst3.8     {d20, d21, d22}, [%3]!         \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_u),      // %1
+        "+r"(src_v),      // %2
+        "+r"(dst_rgb24),  // %3
+        "+r"(width)       // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-#define ARGBTORGB565                                                           \
-    "vshll.u8    q0, d22, #8                   \n"  /* R                    */ \
-    "vshll.u8    q8, d21, #8                   \n"  /* G                    */ \
-    "vshll.u8    q9, d20, #8                   \n"  /* B                    */ \
-    "vsri.16     q0, q8, #5                    \n"  /* RG                   */ \
-    "vsri.16     q0, q9, #11                   \n"  /* RGB                  */
+#define ARGBTORGB565                                                        \
+  "vshll.u8    q0, d22, #8                   \n" /* R                    */ \
+  "vshll.u8    q8, d21, #8                   \n" /* G                    */ \
+  "vshll.u8    q9, d20, #8                   \n" /* B                    */ \
+  "vsri.16     q0, q8, #5                    \n" /* RG                   */ \
+  "vsri.16     q0, q9, #11                   \n" /* RGB                  */
 
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    ARGBTORGB565
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_rgb565),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n" ARGBTORGB565
+      "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
+      "bgt        1b                             \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_u),       // %1
+        "+r"(src_v),       // %2
+        "+r"(dst_rgb565),  // %3
+        "+r"(width)        // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-#define ARGBTOARGB1555                                                         \
-    "vshll.u8    q0, d23, #8                   \n"  /* A                    */ \
-    "vshll.u8    q8, d22, #8                   \n"  /* R                    */ \
-    "vshll.u8    q9, d21, #8                   \n"  /* G                    */ \
-    "vshll.u8    q10, d20, #8                  \n"  /* B                    */ \
-    "vsri.16     q0, q8, #1                    \n"  /* AR                   */ \
-    "vsri.16     q0, q9, #6                    \n"  /* ARG                  */ \
-    "vsri.16     q0, q10, #11                  \n"  /* ARGB                 */
+#define ARGBTOARGB1555                                                      \
+  "vshll.u8    q0, d23, #8                   \n" /* A                    */ \
+  "vshll.u8    q8, d22, #8                   \n" /* R                    */ \
+  "vshll.u8    q9, d21, #8                   \n" /* G                    */ \
+  "vshll.u8    q10, d20, #8                  \n" /* B                    */ \
+  "vsri.16     q0, q8, #1                    \n" /* AR                   */ \
+  "vsri.16     q0, q9, #6                    \n" /* ARG                  */ \
+  "vsri.16     q0, q10, #11                  \n" /* ARGB                 */
 
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    ARGBTOARGB1555
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb1555),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vmov.u8    d23, #255                      \n" ARGBTOARGB1555
+      "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels
+      "bgt        1b                             \n"
+      : "+r"(src_y),         // %0
+        "+r"(src_u),         // %1
+        "+r"(src_v),         // %2
+        "+r"(dst_argb1555),  // %3
+        "+r"(width)          // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-#define ARGBTOARGB4444                                                         \
-    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
-    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
-    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
-    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
-    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
-    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
-    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
+#define ARGBTOARGB4444                                                      \
+  "vshr.u8    d20, d20, #4                   \n" /* B                    */ \
+  "vbic.32    d21, d21, d4                   \n" /* G                    */ \
+  "vshr.u8    d22, d22, #4                   \n" /* R                    */ \
+  "vbic.32    d23, d23, d4                   \n" /* A                    */ \
+  "vorr       d0, d20, d21                   \n" /* BG                   */ \
+  "vorr       d1, d22, d23                   \n" /* RA                   */ \
+  "vzip.u8    d0, d1                         \n" /* BGRA                 */
 
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB
-    "subs       %4, %4, #8                     \n"
-    "vmov.u8    d23, #255                      \n"
-    ARGBTOARGB4444
-    MEMACCESS(3)
-    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
-    "bgt        1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb4444),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8    d4, #0x0f                      \n"  // vbic bits to clear
+      "1:                                        \n"
+
+      READYUV422 YUVTORGB
+      "subs       %4, %4, #8                     \n"
+      "vmov.u8    d23, #255                      \n" ARGBTOARGB4444
+      "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels
+      "bgt        1b                             \n"
+      : "+r"(src_y),         // %0
+        "+r"(src_u),         // %1
+        "+r"(src_v),         // %2
+        "+r"(dst_argb4444),  // %3
+        "+r"(width)          // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void I400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUV400
-    YUVTORGB
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
-      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
-      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
-      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      YUVTORGB_SETUP
+      "vmov.u8    d23, #255                      \n"
+      "1:                                        \n" READYUV400 YUVTORGB
+      "subs       %2, %2, #8                     \n"
+      "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB),
+        [kUVToG] "r"(&kYuvI601Constants.kUVToG),
+        [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR),
+        [kYToRgb] "r"(&kYuvI601Constants.kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void J400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d20}, [%0]!                   \n"
-    "vmov       d21, d20                       \n"
-    "vmov       d22, d20                       \n"
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    :
-    : "cc", "memory", "d20", "d21", "d22", "d23"
-  );
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8    d23, #255                      \n"
+      "1:                                        \n"
+      "vld1.8     {d20}, [%0]!                   \n"
+      "vmov       d21, d20                       \n"
+      "vmov       d22, d20                       \n"
+      "subs       %2, %2, #8                     \n"
+      "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d20", "d21", "d22", "d23");
 }
 
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READNV12
-    YUVTORGB
-    "subs       %3, %3, #8                     \n"
-    MEMACCESS(2)
-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_argb),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8    d23, #255                      \n"
+               "1:                                        \n" READNV12 YUVTORGB
+               "subs       %3, %3, #8                     \n"
+               "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
+               "bgt        1b                             \n"
+               : "+r"(src_y),     // %0
+                 "+r"(src_uv),    // %1
+                 "+r"(dst_argb),  // %2
+                 "+r"(width)      // %3
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_vu,
-                        uint8* dst_argb,
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READNV21
-    YUVTORGB
-    "subs       %3, %3, #8                     \n"
-    MEMACCESS(2)
-    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_vu),    // %1
-      "+r"(dst_argb),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8    d23, #255                      \n"
+               "1:                                        \n" READNV21 YUVTORGB
+               "subs       %3, %3, #8                     \n"
+               "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
+               "bgt        1b                             \n"
+               : "+r"(src_y),     // %0
+                 "+r"(src_vu),    // %1
+                 "+r"(dst_argb),  // %2
+                 "+r"(width)      // %3
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  asm volatile(
+
+      YUVTORGB_SETUP
+
+      "1:                                        \n"
+
+      READNV12 YUVTORGB
+      "subs       %3, %3, #8                     \n"
+      "vst3.8     {d20, d21, d22}, [%2]!         \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_uv),     // %1
+        "+r"(dst_rgb24),  // %2
+        "+r"(width)       // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  asm volatile(
+
+      YUVTORGB_SETUP
+
+      "1:                                        \n"
+
+      READNV21 YUVTORGB
+      "subs       %3, %3, #8                     \n"
+      "vst3.8     {d20, d21, d22}, [%2]!         \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_vu),     // %1
+        "+r"(dst_rgb24),  // %2
+        "+r"(width)       // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
+}
+
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READNV12
-    YUVTORGB
-    "subs       %3, %3, #8                     \n"
-    ARGBTORGB565
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_uv),    // %1
-      "+r"(dst_rgb565),  // %2
-      "+r"(width)      // %3
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READNV12 YUVTORGB
+      "subs       %3, %3, #8                     \n" ARGBTORGB565
+      "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
+      "bgt        1b                             \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_uv),      // %1
+        "+r"(dst_rgb565),  // %2
+        "+r"(width)        // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11",
+        "q12", "q13", "q14", "q15");
 }
 
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READYUY2
-    YUVTORGB
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_yuy2),  // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8    d23, #255                      \n"
+               "1:                                        \n" READYUY2 YUVTORGB
+               "subs       %2, %2, #8                     \n"
+               "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+               "bgt        1b                             \n"
+               : "+r"(src_yuy2),  // %0
+                 "+r"(dst_argb),  // %1
+                 "+r"(width)      // %2
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "vmov.u8    d23, #255                      \n"
-  "1:                                          \n"
-    READUYVY
-    YUVTORGB
-    "subs       %2, %2, #8                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
-    "bgt        1b                             \n"
-    : "+r"(src_uyvy),  // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "q0", "q1", "q2", "q3", "q4",
-      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+  asm volatile(YUVTORGB_SETUP
+               "vmov.u8    d23, #255                      \n"
+               "1:                                        \n" READUYVY YUVTORGB
+               "subs       %2, %2, #8                     \n"
+               "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"
+               "bgt        1b                             \n"
+               : "+r"(src_uyvy),  // %0
+                 "+r"(dst_argb),  // %1
+                 "+r"(width)      // %2
+               : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+                 [kUVToG] "r"(&yuvconstants->kUVToG),
+                 [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+                 [kYToRgb] "r"(&yuvconstants->kYToRgb)
+               : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9",
+                 "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
 // Reads 16 pairs of UV and writes even values to dst_u and odd values to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store U
-    MEMACCESS(2)
-    "vst1.8     {q1}, [%2]!                    \n"  // store V
-    "bgt        1b                             \n"
-    : "+r"(src_uv),  // %0
-      "+r"(dst_u),   // %1
-      "+r"(dst_v),   // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
+      "subs       %3, %3, #16                    \n"  // 16 processed per loop
+      "vst1.8     {q0}, [%1]!                    \n"  // store U
+      "vst1.8     {q1}, [%2]!                    \n"  // store V
+      "bgt        1b                             \n"
+      : "+r"(src_uv),               // %0
+        "+r"(dst_u),                // %1
+        "+r"(dst_v),                // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
 }
 
 // Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load U
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load V
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    MEMACCESS(2)
-    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
-    "bgt        1b                             \n"
-    :
-      "+r"(src_u),   // %0
-      "+r"(src_v),   // %1
-      "+r"(dst_uv),  // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load U
+      "vld1.8     {q1}, [%1]!                    \n"  // load V
+      "subs       %3, %3, #16                    \n"  // 16 processed per loop
+      "vst2.8     {q0, q1}, [%2]!                \n"  // store 16 pairs of UV
+      "bgt        1b                             \n"
+      : "+r"(src_u),                // %0
+        "+r"(src_v),                // %1
+        "+r"(dst_uv),               // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
 }
 
+// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB
+      "vld3.8     {d1, d3, d5}, [%0]!            \n"  // next 8 RGB
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop
+      "vst1.8     {q0}, [%1]!                    \n"  // store R
+      "vst1.8     {q1}, [%2]!                    \n"  // store G
+      "vst1.8     {q2}, [%3]!                    \n"  // store B
+      "bgt        1b                             \n"
+      : "+r"(src_rgb),                    // %0
+        "+r"(dst_r),                      // %1
+        "+r"(dst_g),                      // %2
+        "+r"(dst_b),                      // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "d0", "d1", "d2"  // Clobber List
+      );
+}
+
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB pixels at a time.
+void MergeRGBRow_NEON(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_rgb,
+                      int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load R
+      "vld1.8     {q1}, [%1]!                    \n"  // load G
+      "vld1.8     {q2}, [%2]!                    \n"  // load B
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop
+      "vst3.8     {d0, d2, d4}, [%3]!            \n"  // store 8 RGB
+      "vst3.8     {d1, d3, d5}, [%3]!            \n"  // next 8 RGB
+      "bgt        1b                             \n"
+      : "+r"(src_r),                      // %0
+        "+r"(src_g),                      // %1
+        "+r"(src_b),                      // %2
+        "+r"(dst_rgb),                    // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+      );
+}
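
A scalar sketch of the new merge (hypothetical name; the NEON version interleaves 8 pixels per vst3.8, and SplitRGBRow above is the inverse):

#include <stdint.h>

// Interleave planar R, G and B into packed 3-byte pixels.
void MergeRGBRow_Scalar(const uint8_t* src_r, const uint8_t* src_g,
                        const uint8_t* src_b, uint8_t* dst_rgb, int width) {
  for (int i = 0; i < width; ++i) {
    dst_rgb[3 * i + 0] = src_r[i];
    dst_rgb[3 * i + 1] = src_g[i];
    dst_rgb[3 * i + 2] = src_b[i];
  }
}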
+
 // Copy multiple of 32.  vld1.8 allows unaligned access and is fastest on a15.
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
-    "subs       %2, %2, #32                    \n"  // 32 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2  // Output registers
-  :                     // Input registers
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32
+      "subs       %2, %2, #32                    \n"  // 32 processed per loop
+      "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32
+      "bgt        1b                             \n"
+      : "+r"(src),                  // %0
+        "+r"(dst),                  // %1
+        "+r"(width)                 // %2  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
 }
 
-// SetRow writes 'count' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8* dst, uint8 v8, int count) {
-  asm volatile (
-    "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
-  "1:                                          \n"
-    "subs      %1, %1, #16                     \n"  // 16 bytes per loop
-    MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
-    "bgt       1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v8)      // %2
-  : "cc", "memory", "q0"
-  );
+// SetRow writes 'width' bytes using an 8-bit value repeated.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+  asm volatile(
+      "vdup.8    q0, %2                          \n"  // duplicate 16 bytes
+      "1:                                        \n"
+      "subs      %1, %1, #16                     \n"  // 16 bytes per loop
+      "vst1.8    {q0}, [%0]!                     \n"  // store
+      "bgt       1b                              \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v8)      // %2
+      : "cc", "memory", "q0");
 }
 
-// ARGBSetRow writes 'count' pixels using an 32 bit value repeated.
-void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (
-    "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
-  "1:                                          \n"
-    "subs      %1, %1, #4                      \n"  // 4 pixels per loop
-    MEMACCESS(0)
-    "vst1.8    {q0}, [%0]!                     \n"  // store
-    "bgt       1b                              \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v32)     // %2
-  : "cc", "memory", "q0"
-  );
+// ARGBSetRow writes 'width' pixels using a 32-bit value repeated.
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+  asm volatile(
+      "vdup.u32  q0, %2                          \n"  // duplicate 4 ints
+      "1:                                        \n"
+      "subs      %1, %1, #4                      \n"  // 4 pixels per loop
+      "vst1.8    {q0}, [%0]!                     \n"  // store
+      "bgt       1b                              \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v32)     // %2
+      : "cc", "memory", "q0");
 }
 
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r3, #-16                       \n"
-    "add        %0, %0, %2                     \n"
-    "sub        %0, #16                        \n"
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "mov        r3, #-16                       \n"
+      "add        %0, %0, %2                     \n"
+      "sub        %0, #16                        \n"
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #16                        \n"  // 16 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
-  );
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+      "subs       %2, #16                        \n"  // 16 pixels per loop.
+      "vrev64.8   q0, q0                         \n"
+      "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+      "vst1.8     {d0}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "cc", "memory", "r3", "q0");
 }
 
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void MirrorUVRow_NEON(const uint8_t* src_uv,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
                       int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r12, #-16                      \n"
-    "add        %0, %0, %3, lsl #1             \n"
-    "sub        %0, #16                        \n"
+  asm volatile(
+      // Start at end of source row.
+      "mov        r12, #-16                      \n"
+      "add        %0, %0, %3, lsl #1             \n"
+      "sub        %0, #16                        \n"
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
-    "subs       %3, #8                         \n"  // 8 pixels per loop.
-    "vrev64.8   q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_u),   // %1
-    "+r"(dst_v),   // %2
-    "+r"(width)    // %3
-  :
-  : "cc", "memory", "r12", "q0"
-  );
+      "1:                                        \n"
+      "vld2.8     {d0, d1}, [%0], r12            \n"  // src -= 16
+      "subs       %3, #8                         \n"  // 8 pixels per loop.
+      "vrev64.8   q0, q0                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // dst += 8
+      "vst1.8     {d1}, [%2]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_u),   // %1
+        "+r"(dst_v),   // %2
+        "+r"(width)    // %3
+      :
+      : "cc", "memory", "r12", "q0");
 }
 
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // Start at end of source row.
-    "mov        r3, #-16                       \n"
-    "add        %0, %0, %2, lsl #2             \n"
-    "sub        %0, #16                        \n"
+void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "mov        r3, #-16                       \n"
+      "add        %0, %0, %2, lsl #2             \n"
+      "sub        %0, #16                        \n"
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
-    "subs       %2, #4                         \n"  // 4 pixels per loop.
-    "vrev64.32  q0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  :
-  : "cc", "memory", "r3", "q0"
-  );
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0], r3                 \n"  // src -= 16
+      "subs       %2, #4                         \n"  // 4 pixels per loop.
+      "vrev64.32  q0, q0                         \n"
+      "vst1.8     {d1}, [%1]!                    \n"  // dst += 16
+      "vst1.8     {d0}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "cc", "memory", "r3", "q0");
 }
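
All three mirror kernels are reversals; in scalar terms (hypothetical name):

#include <stdint.h>

void MirrorRow_Scalar(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}

The NEON versions walk the source backwards 16 bytes at a time (post-index by -16), reverse within the vector (vrev64.8 for bytes, vrev64.32 for whole ARGB pixels), and store d1 before d0 to complete the 128-bit reversal.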
 
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width) {
+  asm volatile(
+      "vmov.u8    d4, #255                       \n"  // Alpha
+      "1:                                        \n"
+      "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+      );
 }
 
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d4, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
-    MEMACCESS(1)
-    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8    d4, #255                       \n"  // Alpha
+      "1:                                        \n"
+      "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vswp.u8    d1, d3                         \n"  // swap R, B
+      "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_raw),   // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+      );
 }
 
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
-    MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),    // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3"  // Clobber List
-  );
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vswp.u8    d1, d3                         \n"  // swap R, B
+      "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of
+                                                      // RGB24.
+      "bgt        1b                             \n"
+      : "+r"(src_raw),    // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3"  // Clobber List
+      );
 }
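
RAW and RGB24 differ only in channel order, so the conversion reduces to swapping bytes 0 and 2 of every 3-byte pixel; that is what the vswp.u8 d1, d3 does for 8 pixels at once. Per pixel (hypothetical helper):

#include <stdint.h>

void SwapChannel02(const uint8_t src[3], uint8_t dst[3]) {
  dst[0] = src[2];
  dst[1] = src[1];
  dst[2] = src[0];
}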
 
-#define RGB565TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxGGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB RRRRRxxx */ \
-    "vshl.u8    d6, d6, #2                     \n"  /* G GGGGGG00 upper 6   */ \
-    "vshr.u8    d1, d1, #3                     \n"  /* R 000RRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #6                     \n"  /* G 000000GG lower 2   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+#define RGB565TOARGB                                                        \
+  "vshrn.u16  d6, q0, #5                     \n" /* G xxGGGGGG           */ \
+  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB RRRRRxxx */ \
+  "vshl.u8    d6, d6, #2                     \n" /* G GGGGGG00 upper 6   */ \
+  "vshr.u8    d1, d1, #3                     \n" /* R 000RRRRR lower 5   */ \
+  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
+  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
+  "vshr.u8    d4, d6, #6                     \n" /* G 000000GG lower 2   */ \
+  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
+  "vorr.u8    d1, d4, d6                     \n" /* G                    */
 
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    RGB565TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
-  );
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      "vmov.u8    d3, #255                       \n"  // Alpha
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_argb),    // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
-#define ARGB1555TOARGB                                                         \
-    "vshrn.u16  d7, q0, #8                     \n"  /* A Arrrrrxx           */ \
-    "vshr.u8    d6, d7, #2                     \n"  /* R xxxRRRRR           */ \
-    "vshrn.u16  d5, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vmovn.u16  d4, q0                         \n"  /* B xxxBBBBB           */ \
-    "vshr.u8    d7, d7, #7                     \n"  /* A 0000000A           */ \
-    "vneg.s8    d7, d7                         \n"  /* A AAAAAAAA upper 8   */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* R RRRRR000 upper 5   */ \
-    "vshr.u8    q1, q3, #5                     \n"  /* R,A 00000RRR lower 3 */ \
-    "vshl.u8    q0, q2, #3                     \n"  /* B,G BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,G 00000BBB lower 3 */ \
-    "vorr.u8    q1, q1, q3                     \n"  /* R,A                  */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,G                  */ \
+#define ARGB1555TOARGB                                                      \
+  "vshrn.u16  d7, q0, #8                     \n" /* A Arrrrrxx           */ \
+  "vshr.u8    d6, d7, #2                     \n" /* R xxxRRRRR           */ \
+  "vshrn.u16  d5, q0, #5                     \n" /* G xxxGGGGG           */ \
+  "vmovn.u16  d4, q0                         \n" /* B xxxBBBBB           */ \
+  "vshr.u8    d7, d7, #7                     \n" /* A 0000000A           */ \
+  "vneg.s8    d7, d7                         \n" /* A AAAAAAAA upper 8   */ \
+  "vshl.u8    d6, d6, #3                     \n" /* R RRRRR000 upper 5   */ \
+  "vshr.u8    q1, q3, #5                     \n" /* R,A 00000RRR lower 3 */ \
+  "vshl.u8    q0, q2, #3                     \n" /* B,G BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,G 00000BBB lower 3 */ \
+  "vorr.u8    q1, q1, q3                     \n" /* R,A                  */ \
+  "vorr.u8    q0, q0, q2                     \n" /* B,G                  */
 
 // RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB                                                           \
-    "vshrn.u16  d6, q0, #5                     \n"  /* G xxxGGGGG           */ \
-    "vuzp.u8    d0, d1                         \n"  /* d0 xxxBBBBB xRRRRRxx */ \
-    "vshl.u8    d6, d6, #3                     \n"  /* G GGGGG000 upper 5   */ \
-    "vshr.u8    d1, d1, #2                     \n"  /* R 00xRRRRR lower 5   */ \
-    "vshl.u8    q0, q0, #3                     \n"  /* B,R BBBBB000 upper 5 */ \
-    "vshr.u8    q2, q0, #5                     \n"  /* B,R 00000BBB lower 3 */ \
-    "vorr.u8    d0, d0, d4                     \n"  /* B                    */ \
-    "vshr.u8    d4, d6, #5                     \n"  /* G 00000GGG lower 3   */ \
-    "vorr.u8    d2, d1, d5                     \n"  /* R                    */ \
-    "vorr.u8    d1, d4, d6                     \n"  /* G                    */
+#define RGB555TOARGB                                                        \
+  "vshrn.u16  d6, q0, #5                     \n" /* G xxxGGGGG           */ \
+  "vuzp.u8    d0, d1                         \n" /* d0 xxxBBBBB xRRRRRxx */ \
+  "vshl.u8    d6, d6, #3                     \n" /* G GGGGG000 upper 5   */ \
+  "vshr.u8    d1, d1, #2                     \n" /* R 00xRRRRR lower 5   */ \
+  "vshl.u8    q0, q0, #3                     \n" /* B,R BBBBB000 upper 5 */ \
+  "vshr.u8    q2, q0, #5                     \n" /* B,R 00000BBB lower 3 */ \
+  "vorr.u8    d0, d0, d4                     \n" /* B                    */ \
+  "vshr.u8    d4, d6, #5                     \n" /* G 00000GGG lower 3   */ \
+  "vorr.u8    d2, d1, d5                     \n" /* R                    */ \
+  "vorr.u8    d1, d4, d6                     \n" /* G                    */
 
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
                             int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
-  );
+  asm volatile(
+      "vmov.u8    d3, #255                       \n"  // Alpha
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
-#define ARGB4444TOARGB                                                         \
-    "vuzp.u8    d0, d1                         \n"  /* d0 BG, d1 RA         */ \
-    "vshl.u8    q2, q0, #4                     \n"  /* B,R BBBB0000         */ \
-    "vshr.u8    q1, q0, #4                     \n"  /* G,A 0000GGGG         */ \
-    "vshr.u8    q0, q2, #4                     \n"  /* B,R 0000BBBB         */ \
-    "vorr.u8    q0, q0, q2                     \n"  /* B,R BBBBBBBB         */ \
-    "vshl.u8    q2, q1, #4                     \n"  /* G,A GGGG0000         */ \
-    "vorr.u8    q1, q1, q2                     \n"  /* G,A GGGGGGGG         */ \
-    "vswp.u8    d1, d2                         \n"  /* B,R,G,A -> B,G,R,A   */
+#define ARGB4444TOARGB                                                      \
+  "vuzp.u8    d0, d1                         \n" /* d0 BG, d1 RA         */ \
+  "vshl.u8    q2, q0, #4                     \n" /* B,R BBBB0000         */ \
+  "vshr.u8    q1, q0, #4                     \n" /* G,A 0000GGGG         */ \
+  "vshr.u8    q0, q2, #4                     \n" /* B,R 0000BBBB         */ \
+  "vorr.u8    q0, q0, q2                     \n" /* B,R BBBBBBBB         */ \
+  "vshl.u8    q2, q1, #4                     \n" /* G,A GGGG0000         */ \
+  "vorr.u8    q1, q1, q2                     \n" /* G,A GGGGGGGG         */ \
+  "vswp.u8    d1, d2                         \n" /* B,R,G,A -> B,G,R,A   */
 
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
                             int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
-  );
+  asm volatile(
+      "vmov.u8    d3, #255                       \n"  // Alpha
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+      );
 }
 
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_rgb24,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of
+                                                      // RGB24.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+      );
 }
 
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vswp.u8    d1, d3                         \n"  // swap R, B
-    MEMACCESS(1)
-    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_raw),   // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
-  );
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vswp.u8    d1, d3                         \n"  // swap R, B
+      "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_raw),   // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d1", "d2", "d3", "d4"  // Clobber List
+      );
 }
 
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
+      "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+      "vst1.8     {q0}, [%1]!                    \n"  // store 16 pixels of Y.
+      "bgt        1b                             \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
 }
 
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
+      "subs       %2, %2, #16                    \n"  // 16 processed per loop.
+      "vst1.8     {q1}, [%1]!                    \n"  // store 16 pixels of Y.
+      "bgt        1b                             \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
 }
 
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
-    MEMACCESS(2)
-    "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+      "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+      "vst1.8     {d1}, [%1]!                    \n"  // store 8 U.
+      "vst1.8     {d3}, [%2]!                    \n"  // store 8 V.
+      "bgt        1b                             \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+      );
 }
 
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
-    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
-    MEMACCESS(2)
-    "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+      "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 U.
+      "vst1.8     {d2}, [%2]!                    \n"  // store 8 V.
+      "bgt        1b                             \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3"  // Clobber List
+      );
 }
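
Both packed 4:2:2 layouts carry two pixels per four bytes: YUY2 is Y0 U Y1 V and UYVY is U Y0 V Y1. After the vld4.8 de-interleave, extracting UV422 is pure lane selection. For the UYVY case, roughly (hypothetical name):

#include <stdint.h>

void UyvyToUv422Row_Scalar(const uint8_t* src_uyvy, uint8_t* dst_u,
                           uint8_t* dst_v, int width) {
  for (int i = 0; i < width; i += 2) { /* one macropixel = 2 pixels */
    dst_u[i / 2] = src_uyvy[2 * i + 0];
    dst_v[i / 2] = src_uyvy[2 * i + 2];
  }
}

For YUY2 the U and V sit at byte offsets 1 and 3 of each macropixel instead.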
 
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_yuy2
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
-    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
-    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
-    MEMACCESS(3)
-    "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_yuy2),     // %0
-    "+r"(stride_yuy2),  // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
-  );
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "add        %1, %0, %1                     \n"  // stride + src_yuy2
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+      "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
+      "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
+      "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
+      "vst1.8     {d1}, [%2]!                    \n"  // store 8 U.
+      "vst1.8     {d3}, [%3]!                    \n"  // store 8 V.
+      "bgt        1b                             \n"
+      : "+r"(src_yuy2),     // %0
+        "+r"(stride_yuy2),  // %1
+        "+r"(dst_u),        // %2
+        "+r"(dst_v),        // %3
+        "+r"(width)         // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+        "d7"  // Clobber List
+      );
 }
 
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // stride + src_uyvy
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
-    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
-    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
-    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
-    MEMACCESS(3)
-    "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
-    "bgt        1b                             \n"
-  : "+r"(src_uyvy),     // %0
-    "+r"(stride_uyvy),  // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
-  );
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  asm volatile(
+      "add        %1, %0, %1                     \n"  // stride + src_uyvy
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+      "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
+      "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
+      "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
+      "vst1.8     {d0}, [%2]!                    \n"  // store 8 U.
+      "vst1.8     {d2}, [%3]!                    \n"  // store 8 V.
+      "bgt        1b                             \n"
+      : "+r"(src_uyvy),     // %0
+        "+r"(stride_uyvy),  // %1
+        "+r"(dst_u),        // %2
+        "+r"(dst_v),        // %3
+        "+r"(width)         // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
+        "d7"  // Clobber List
+      );
 }
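
The two-row variants produce 4:2:0 chroma, so U and V from vertically adjacent rows are averaged. vrhadd.u8 is a rounding halving add, per lane equivalent to:

#include <stdint.h>

uint8_t RoundedHalvingAdd(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}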
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // shuffler
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
-    "subs       %2, %2, #4                     \n"  // 4 processed per loop
-    "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
-    "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store 4.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "cc", "memory", "q0", "q1", "q2"  // Clobber List
-  );
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  asm volatile(
+      "vld1.8     {q2}, [%3]                     \n"  // shuffler
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 4 pixels.
+      "subs       %2, %2, #4                     \n"  // 4 processed per loop
+      "vtbl.8     d2, {d0, d1}, d4               \n"  // look up 2 first pixels
+      "vtbl.8     d3, {d0, d1}, d5               \n"  // look up 2 next pixels
+      "vst1.8     {q1}, [%1]!                    \n"  // store 4.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),                   // %0
+        "+r"(dst_argb),                   // %1
+        "+r"(width)                       // %2
+      : "r"(shuffler)                     // %3
+      : "cc", "memory", "q0", "q1", "q2"  // Clobber List
+      );
 }
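
vtbl.8 is a byte table lookup: each output byte selects an input byte by index from the 16-byte table held in q2. Since the shuffle masks used with this routine repeat one 4-byte pattern, it amounts to a per-pixel channel permutation; a scalar sketch under that assumption (hypothetical name):

#include <stdint.h>

void ArgbShuffleRow_Scalar(const uint8_t* src_argb, uint8_t* dst_argb,
                           const uint8_t* shuffler, int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 4; ++j) { /* 4 bytes per ARGB pixel */
      dst_argb[4 * i + j] = src_argb[4 * i + shuffler[j]];
    }
  }
}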
 
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
-    MEMACCESS(2)
-    "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
-    "subs       %4, %4, #16                    \n"  // 16 pixels
-    MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_yuy2),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
-  );
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {d0, d2}, [%0]!                \n"  // load 16 Ys
+      "vld1.8     {d1}, [%1]!                    \n"  // load 8 Us
+      "vld1.8     {d3}, [%2]!                    \n"  // load 8 Vs
+      "subs       %4, %4, #16                    \n"  // 16 pixels
+      "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 YUY2/16 pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3");
 }
 
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
-    MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
-    MEMACCESS(2)
-    "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
-    "subs       %4, %4, #16                    \n"  // 16 pixels
-    MEMACCESS(3)
-    "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_uyvy),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
-  );
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {d1, d3}, [%0]!                \n"  // load 16 Ys
+      "vld1.8     {d0}, [%1]!                    \n"  // load 8 Us
+      "vld1.8     {d2}, [%2]!                    \n"  // load 8 Vs
+      "subs       %4, %4, #16                    \n"  // 16 pixels
+      "vst4.8     {d0, d1, d2, d3}, [%3]!        \n"  // Store 8 UYVY/16 pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3");
 }
 
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTORGB565
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgb565),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_rgb565,
+                          int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGBTORGB565
+      "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels RGB565.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),    // %0
+        "+r"(dst_rgb565),  // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
 }
 
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "vdup.32    d2, %2                         \n"  // dither4
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d20, d20, d2                   \n"
-    "vqadd.u8   d21, d21, d2                   \n"
-    "vqadd.u8   d22, d22, d2                   \n"
-    ARGBTORGB565
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"  // store 8 pixels RGB565.
-    "bgt        1b                             \n"
-  : "+r"(dst_rgb)    // %0
-  : "r"(src_argb),   // %1
-    "r"(dither4),    // %2
-    "r"(width)       // %3
-  : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"
-  );
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "vdup.32    d2, %2                         \n"  // dither4
+      "1:                                        \n"
+      "vld4.8     {d20, d21, d22, d23}, [%1]!    \n"  // load 8 pixels of ARGB.
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vqadd.u8   d20, d20, d2                   \n"
+      "vqadd.u8   d21, d21, d2                   \n"
+      "vqadd.u8   d22, d22, d2                   \n"  // add for dither
+      ARGBTORGB565
+      "vst1.8     {q0}, [%0]!                    \n"  // store 8 RGB565.
+      "bgt        1b                             \n"
+      : "+r"(dst_rgb)   // %0
+      : "r"(src_argb),  // %1
+        "r"(dither4),   // %2
+        "r"(width)      // %3
+      : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11");
 }
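
The dither variant saturate-adds (vqadd.u8) one dither byte per pixel to B, G and R before ARGBTORGB565 (defined earlier in this file) truncates to 5/6/5 bits; vdup.32 replicates the four dither4 bytes across d2, so pixel i of every group of four gets byte (i & 3). Per channel, roughly (hypothetical helper):

#include <stdint.h>

uint8_t DitherChannel(uint8_t v, uint32_t dither4, int i) {
  uint8_t d = (uint8_t)(dither4 >> ((i & 3) * 8));
  unsigned s = (unsigned)v + d;
  return (uint8_t)(s > 255 ? 255 : s); /* vqadd.u8 saturates at 255 */
}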
 
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb1555,
                             int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTOARGB1555
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB1555.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb1555),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGBTOARGB1555
+      "vst1.8     {q0}, [%1]!                    \n"  // store 8 ARGB1555.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb1555),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
 }
 
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb4444,
                             int width) {
-  asm volatile (
-    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGBTOARGB4444
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"  // store 8 pixels ARGB4444.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb4444),  // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
-  );
+  asm volatile(
+      "vmov.u8    d4, #0x0f                      \n"  // bits to clear with
+                                                      // vbic.
+      "1:                                        \n"
+      "vld4.8     {d20, d21, d22, d23}, [%0]!    \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGBTOARGB4444
+      "vst1.8     {q0}, [%1]!                    \n"  // store 8 ARGB4444.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb4444),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q8", "q9", "q10", "q11");
 }
 
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+      "vmov.u8    d27, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d27                        \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
 }
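
The constants are BT.601 studio-swing luma weights scaled by 128; vqrshrun.s16 #7 performs the rounding shift back down and vqadd.u8 adds the +16 offset with saturation. In scalar form (hypothetical helper):

#include <stdint.h>

uint8_t ArgbPixelToY(uint8_t b, uint8_t g, uint8_t r) {
  int y = (13 * b + 65 * g + 33 * r + 64) >> 7; /* round, 1/128 fixed point */
  y += 16;                                      /* studio-swing offset */
  return (uint8_t)(y > 255 ? 255 : y);
}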
 
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q3}, [%1]!                    \n"  // store 16 A's.
-    "bgt       1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_a),      // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
-  );
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels
+      "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels
+      "subs       %2, %2, #16                    \n"  // 16 processed per loop
+      "vst1.8     {q3}, [%1]!                    \n"  // store 16 A's.
+      "bgt       1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_a),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+      "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+      "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit Y
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
 }
 
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width) {
-  asm volatile (
-    "vmov.u8    d24, #112                      \n"  // UB / VR 0.875 coefficient
-    "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
-    "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
-    "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
-    "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlsl.u8   q2, d1, d25                    \n"  // G
-    "vmlsl.u8   q2, d2, d26                    \n"  // R
-    "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
+  asm volatile(
+      "vmov.u8    d24, #112                      \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.u8    d25, #74                       \n"  // UG -0.5781 coefficient
+      "vmov.u8    d26, #38                       \n"  // UR -0.2969 coefficient
+      "vmov.u8    d27, #18                       \n"  // VB -0.1406 coefficient
+      "vmov.u8    d28, #94                       \n"  // VG -0.7344 coefficient
+      "vmov.u16   q15, #0x8080                   \n"  // 128.5
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlsl.u8   q2, d1, d25                    \n"  // G
+      "vmlsl.u8   q2, d2, d26                    \n"  // R
+      "vadd.u16   q2, q2, q15                    \n"  // +128 -> unsigned
 
-    "vmull.u8   q3, d2, d24                    \n"  // R
-    "vmlsl.u8   q3, d1, d28                    \n"  // G
-    "vmlsl.u8   q3, d0, d27                    \n"  // B
-    "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
+      "vmull.u8   q3, d2, d24                    \n"  // R
+      "vmlsl.u8   q3, d1, d28                    \n"  // G
+      "vmlsl.u8   q3, d0, d27                    \n"  // B
+      "vadd.u16   q3, q3, q15                    \n"  // +128 -> unsigned
 
-    "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
+      "vqshrn.u16  d0, q2, #8                    \n"  // 16 bit to 8 bit U
+      "vqshrn.u16  d1, q3, #8                    \n"  // 16 bit to 8 bit V
 
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
-  );
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
+      "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
+        "q15");
 }
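
The chroma math above reads naturally as a scalar sketch (same <stdint.h> types; the helper name is illustrative, not a libyuv API). The 0x8080 bias is the +128 chroma offset plus 0.5 of rounding, pre-scaled by 256 so the single vqshrn.u16 right shift by 8 completes it:

static void ARGBPixelToUV444(uint8_t b, uint8_t g, uint8_t r,
                             uint8_t* u, uint8_t* v) {
  int32_t uu = 112 * b - 74 * g - 38 * r + 0x8080;  // 0x8080 = 128.5 * 256
  int32_t vv = 112 * r - 94 * g - 18 * b + 0x8080;
  *u = (uint8_t)(uu >> 8);  // result always lands in [16, 240], so the
  *v = (uint8_t)(vv >> 8);  // vqshrn saturation never triggers
}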
 
-// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(0)
-    "vld4.8     {d8, d10, d12, d14}, [%0]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d9, d11, d13, d15}, [%0]!     \n"  // load last 8 ARGB pixels.
-    "vpaddl.u8  q4, q4                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q5, q5                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q6, q6                         \n"  // R 16 bytes -> 8 shorts.
-
-    "vpadd.u16  d0, d0, d1                     \n"  // B 16 shorts -> 8 shorts.
-    "vpadd.u16  d1, d8, d9                     \n"  // B
-    "vpadd.u16  d2, d2, d3                     \n"  // G 16 shorts -> 8 shorts.
-    "vpadd.u16  d3, d10, d11                   \n"  // G
-    "vpadd.u16  d4, d4, d5                     \n"  // R 16 shorts -> 8 shorts.
-    "vpadd.u16  d5, d12, d13                   \n"  // R
-
-    "vrshr.u16  q0, q0, #1                     \n"  // 2x average
-    "vrshr.u16  q1, q1, #1                     \n"
-    "vrshr.u16  q2, q2, #1                     \n"
-
-    "subs       %3, %3, #32                    \n"  // 32 processed per loop.
-    "vmul.s16   q8, q0, q10                    \n"  // B
-    "vmls.s16   q8, q1, q11                    \n"  // G
-    "vmls.s16   q8, q2, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q2, q10                    \n"  // R
-    "vmls.s16   q9, q1, q14                    \n"  // G
-    "vmls.s16   q9, q0, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "vst1.8     {d1}, [%2]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
-}
-
+// clang-format off
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
-    "vmul.s16   q8, " #QB ", q10               \n"  /* B                    */ \
-    "vmls.s16   q8, " #QG ", q11               \n"  /* G                    */ \
-    "vmls.s16   q8, " #QR ", q12               \n"  /* R                    */ \
-    "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned     */ \
-    "vmul.s16   q9, " #QR ", q10               \n"  /* R                    */ \
-    "vmls.s16   q9, " #QG ", q14               \n"  /* G                    */ \
-    "vmls.s16   q9, " #QB ", q13               \n"  /* B                    */ \
-    "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned     */ \
-    "vqshrn.u16  d0, q8, #8                    \n"  /* 16 bit to 8 bit U    */ \
-    "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V    */
+#define RGBTOUV(QB, QG, QR)                                                 \
+  "vmul.s16   q8, " #QB ", q10               \n" /* B                    */ \
+  "vmls.s16   q8, " #QG ", q11               \n" /* G                    */ \
+  "vmls.s16   q8, " #QR ", q12               \n" /* R                    */ \
+  "vadd.u16   q8, q8, q15                    \n" /* +128 -> unsigned     */ \
+  "vmul.s16   q9, " #QR ", q10               \n" /* R                    */ \
+  "vmls.s16   q9, " #QG ", q14               \n" /* G                    */ \
+  "vmls.s16   q9, " #QB ", q13               \n" /* B                    */ \
+  "vadd.u16   q9, q9, q15                    \n" /* +128 -> unsigned     */ \
+  "vqshrn.u16  d0, q8, #8                    \n" /* 16 bit to 8 bit U    */ \
+  "vqshrn.u16  d1, q9, #8                    \n" /* 16 bit to 8 bit V    */
+// clang-format on
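
With the halved coefficients preloaded (q10 = 56, q11 = 37, q12 = 19, q13 = 9, q14 = 47) and QB/QG/QR holding twice the 2x2 channel averages, RGBTOUV evaluates U = (56*QB - 37*QG - 19*QR + 0x8080) >> 8 and V = (56*QR - 47*QG - 9*QB + 0x8080) >> 8; the same arithmetic as the UV444 kernel above once the doubling and the halving cancel.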
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1468,17 +1335,13 @@
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1490,9 +1353,7 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1507,8 +1368,11 @@
 }
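
The load and pairwise-add sequence feeding RGBTOUV amounts to a rounded 2x2 box average. For one output pixel this could be sketched as follows (a hypothetical helper, not a libyuv API; row1 is row0 plus the source stride, and ARGB is stored B, G, R, A in memory):

static void ARGBToUVPixel2x2(const uint8_t* row0, const uint8_t* row1,
                             uint8_t* u, uint8_t* v) {
  // vpaddl sums horizontal pairs, vpadal adds the second row, and
  // vrshr #1 keeps twice the rounded average, matching the /2 coefficients.
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 1) >> 1;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 1) >> 1;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 1) >> 1;
  *u = (uint8_t)((56 * b - 37 * g - 19 * r + 0x8080) >> 8);  // RGBTOUV U path
  *v = (uint8_t)((56 * r - 47 * g - 9 * b + 0x8080) >> 8);   // RGBTOUV V path
}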
 
 // TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
     "vmov.s16   q10, #127 / 2                  \n"  // UB / VR 0.500 coefficient
@@ -1517,17 +1381,13 @@
     "vmov.s16   q13, #20 / 2                   \n"  // VB -0.08131 coefficient
     "vmov.s16   q14, #107 / 2                  \n"  // VG -0.41869 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ARGB pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1539,9 +1399,7 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1555,8 +1413,11 @@
   );
 }
 
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_bgra
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1565,17 +1426,13 @@
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
-    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
     "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
-    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
     "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
@@ -1587,9 +1444,7 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q3, q2, q1)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_bgra),  // %0
@@ -1603,8 +1458,11 @@
   );
 }
 
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_abgr
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1613,17 +1471,13 @@
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
-    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
     "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more ABGR pixels.
-    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 ABGR pixels.
     "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1635,9 +1489,7 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q2, q1, q0)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_abgr),  // %0
@@ -1651,8 +1503,11 @@
   );
 }
 
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width) {
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_rgba
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1661,17 +1516,13 @@
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 RGBA pixels.
-    MEMACCESS(0)
     "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 RGBA pixels.
     "vpaddl.u8  q0, q1                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q2                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q3                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more RGBA pixels.
-    MEMACCESS(1)
     "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 RGBA pixels.
     "vpadal.u8  q0, q5                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q6                         \n"  // G 16 bytes -> 8 shorts.
@@ -1683,9 +1534,7 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_rgba),  // %0
@@ -1699,8 +1548,11 @@
   );
 }
 
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int width) {
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+                       int src_stride_rgb24,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1709,17 +1561,13 @@
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RGB24 pixels.
-    MEMACCESS(0)
     "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RGB24 pixels.
     "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RGB24 pixels.
-    MEMACCESS(1)
     "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RGB24 pixels.
     "vpadal.u8  q0, q4                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1731,9 +1579,7 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q0, q1, q2)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
@@ -1747,8 +1593,11 @@
   );
 }
 
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int width) {
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+                     int src_stride_raw,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_raw
     "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
@@ -1757,17 +1606,13 @@
     "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
     "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
+    "1:                                        \n"
     "vld3.8     {d0, d2, d4}, [%0]!            \n"  // load 8 RAW pixels.
-    MEMACCESS(0)
     "vld3.8     {d1, d3, d5}, [%0]!            \n"  // load next 8 RAW pixels.
     "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
     "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
     "vpaddl.u8  q0, q0                         \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "vld3.8     {d8, d10, d12}, [%1]!          \n"  // load 8 more RAW pixels.
-    MEMACCESS(1)
     "vld3.8     {d9, d11, d13}, [%1]!          \n"  // load last 8 RAW pixels.
     "vpadal.u8  q2, q6                         \n"  // B 16 bytes -> 8 shorts.
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
@@ -1779,9 +1624,7 @@
 
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
     RGBTOUV(q2, q1, q0)
-    MEMACCESS(2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
   : "+r"(src_raw),  // %0
@@ -1796,686 +1639,640 @@
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+                        int src_stride_rgb565,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  asm volatile(
+      "add        %1, %0, %1                     \n"  // src_stride + src_argb
+      "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+      "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+      "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+      "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+      "vmov.u16   q15, #0x8080                   \n"  // 128.5
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%0]!                    \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // load 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
+      "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+      "vrshr.u16  q5, q5, #1                     \n"
+      "vrshr.u16  q6, q6, #1                     \n"
 
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(src_stride_rgb565),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+      "vmul.s16   q8, q4, q10                    \n"  // B
+      "vmls.s16   q8, q5, q11                    \n"  // G
+      "vmls.s16   q8, q6, q12                    \n"  // R
+      "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+      "vmul.s16   q9, q6, q10                    \n"  // R
+      "vmls.s16   q9, q5, q14                    \n"  // G
+      "vmls.s16   q9, q4, q13                    \n"  // B
+      "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+      "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+      "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+      "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+      "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+      "bgt        1b                             \n"
+      : "+r"(src_rgb565),         // %0
+        "+r"(src_stride_rgb565),  // %1
+        "+r"(dst_u),              // %2
+        "+r"(dst_v),              // %3
+        "+r"(width)               // %4
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
 }
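
RGB565TOARGB (a macro defined earlier in this file) widens each 16-bit pixel back to 8-bit channels; assuming the usual bit-replication expansion, the scalar equivalent is roughly:

static void RGB565ToARGBPixel(uint16_t px,
                              uint8_t* b, uint8_t* g, uint8_t* r) {
  uint8_t b5 = (uint8_t)(px & 0x1f);         // bits 4:0
  uint8_t g6 = (uint8_t)((px >> 5) & 0x3f);  // bits 10:5
  uint8_t r5 = (uint8_t)(px >> 11);          // bits 15:11
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));     // replicate top bits downward
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}

The ARGB1555 and ARGB4444 variants below follow the same pattern, replicating 5-bit and 4-bit channels ((v << 4) | v for 4444) before the shared averaging and RGBTOUV math.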
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+                          int src_stride_argb1555,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  asm volatile(
+      "add        %1, %0, %1                     \n"  // src_stride + src_argb
+      "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+      "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+      "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+      "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+      "vmov.u16   q15, #0x8080                   \n"  // 128.5
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
+      "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+      "vrshr.u16  q5, q5, #1                     \n"
+      "vrshr.u16  q6, q6, #1                     \n"
 
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(src_stride_argb1555),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+      "vmul.s16   q8, q4, q10                    \n"  // B
+      "vmls.s16   q8, q5, q11                    \n"  // G
+      "vmls.s16   q8, q6, q12                    \n"  // R
+      "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+      "vmul.s16   q9, q6, q10                    \n"  // R
+      "vmls.s16   q9, q5, q14                    \n"  // G
+      "vmls.s16   q9, q4, q13                    \n"  // B
+      "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+      "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+      "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+      "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+      "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+      "bgt        1b                             \n"
+      : "+r"(src_argb1555),         // %0
+        "+r"(src_stride_argb1555),  // %1
+        "+r"(dst_u),                // %2
+        "+r"(dst_v),                // %3
+        "+r"(width)                 // %4
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int width) {
-  asm volatile (
-    "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
-    "vmov.u16   q15, #0x8080                   \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+                          int src_stride_argb4444,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  asm volatile(
+      "add        %1, %0, %1                     \n"  // src_stride + src_argb
+      "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875
+                                                      // coefficient
+      "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
+      "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
+      "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
+      "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+      "vmov.u16   q15, #0x8080                   \n"  // 128.5
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpaddl.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%0]!                    \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpaddl.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpaddl.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpaddl.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
-    "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
-    "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // load 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpadal.u8  d8, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d10, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d12, d2                        \n"  // R 8 bytes -> 4 shorts.
+      "vld1.8     {q0}, [%1]!                    \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "vpadal.u8  d9, d0                         \n"  // B 8 bytes -> 4 shorts.
+      "vpadal.u8  d11, d1                        \n"  // G 8 bytes -> 4 shorts.
+      "vpadal.u8  d13, d2                        \n"  // R 8 bytes -> 4 shorts.
 
-    "vrshr.u16  q4, q4, #1                     \n"  // 2x average
-    "vrshr.u16  q5, q5, #1                     \n"
-    "vrshr.u16  q6, q6, #1                     \n"
+      "vrshr.u16  q4, q4, #1                     \n"  // 2x average
+      "vrshr.u16  q5, q5, #1                     \n"
+      "vrshr.u16  q6, q6, #1                     \n"
 
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop.
-    "vmul.s16   q8, q4, q10                    \n"  // B
-    "vmls.s16   q8, q5, q11                    \n"  // G
-    "vmls.s16   q8, q6, q12                    \n"  // R
-    "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
-    "vmul.s16   q9, q6, q10                    \n"  // R
-    "vmls.s16   q9, q5, q14                    \n"  // G
-    "vmls.s16   q9, q4, q13                    \n"  // B
-    "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
-    "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
-    "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(src_stride_argb4444),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
-    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop.
+      "vmul.s16   q8, q4, q10                    \n"  // B
+      "vmls.s16   q8, q5, q11                    \n"  // G
+      "vmls.s16   q8, q6, q12                    \n"  // R
+      "vadd.u16   q8, q8, q15                    \n"  // +128 -> unsigned
+      "vmul.s16   q9, q6, q10                    \n"  // R
+      "vmls.s16   q9, q5, q14                    \n"  // G
+      "vmls.s16   q9, q4, q13                    \n"  // B
+      "vadd.u16   q9, q9, q15                    \n"  // +128 -> unsigned
+      "vqshrn.u16  d0, q8, #8                    \n"  // 16 bit to 8 bit U
+      "vqshrn.u16  d1, q9, #8                    \n"  // 16 bit to 8 bit V
+      "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+      "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+      "bgt        1b                             \n"
+      : "+r"(src_argb4444),         // %0
+        "+r"(src_stride_argb4444),  // %1
+        "+r"(dst_u),                // %2
+        "+r"(dst_v),                // %3
+        "+r"(width)                 // %4
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
+        "q9", "q10", "q11", "q12", "q13", "q14", "q15");
 }
 
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    RGB565TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_y),       // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+      "vmov.u8    d27, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d27                        \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
 }
 
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+      "vmov.u8    d27, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d27                        \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
 }
 
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
-    "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
-    "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
-    "vmov.u8    d27, #16                       \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d27                        \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
-  );
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient
+      "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient
+      "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient
+      "vmov.u8    d27, #16                       \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d27                        \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13");
 }
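
The RGB565/ARGB1555/ARGB4444 luma kernels above, and the BGRA/ABGR ones that follow, all reuse the Q7 math sketched after ARGBToYRow_NEON; only the channel unpacking or the register-to-channel assignment differs.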
 
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d1, d4                     \n"  // R
-    "vmlal.u8   q8, d2, d5                     \n"  // G
-    "vmlal.u8   q8, d3, d6                     \n"  // B
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+      "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+      "vmov.u8    d7, #16                        \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q8, d1, d4                     \n"  // R
+      "vmlal.u8   q8, d2, d5                     \n"  // G
+      "vmlal.u8   q8, d3, d6                     \n"  // B
+      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d7                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_bgra),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
 }
 
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // R
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // B
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+      "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+      "vmov.u8    d7, #16                        \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q8, d0, d4                     \n"  // R
+      "vmlal.u8   q8, d1, d5                     \n"  // G
+      "vmlal.u8   q8, d2, d6                     \n"  // B
+      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d7                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_abgr),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
 }
 
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d1, d4                     \n"  // B
-    "vmlal.u8   q8, d2, d5                     \n"  // G
-    "vmlal.u8   q8, d3, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+      "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+      "vmov.u8    d7, #16                        \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q8, d1, d4                     \n"  // B
+      "vmlal.u8   q8, d2, d5                     \n"  // G
+      "vmlal.u8   q8, d3, d6                     \n"  // R
+      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d7                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_rgba),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
 }
 
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // B
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+      "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+      "vmov.u8    d7, #16                        \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q8, d0, d4                     \n"  // B
+      "vmlal.u8   q8, d1, d5                     \n"  // G
+      "vmlal.u8   q8, d2, d6                     \n"  // R
+      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d7                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_y),      // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
 }
 
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
-  asm volatile (
-    "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
-    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
-    "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
-    "vmov.u8    d7, #16                        \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q8, d0, d4                     \n"  // B
-    "vmlal.u8   q8, d1, d5                     \n"  // G
-    "vmlal.u8   q8, d2, d6                     \n"  // R
-    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
-    "vqadd.u8   d0, d7                         \n"
-    MEMACCESS(1)
-    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
-    "bgt        1b                             \n"
-  : "+r"(src_raw),  // %0
-    "+r"(dst_y),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
-  );
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+  asm volatile(
+      "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient
+      "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+      "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient
+      "vmov.u8    d7, #16                        \n"  // Add 16 constant
+      "1:                                        \n"
+      "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q8, d0, d4                     \n"  // B
+      "vmlal.u8   q8, d1, d5                     \n"  // G
+      "vmlal.u8   q8, d2, d6                     \n"  // R
+      "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+      "vqadd.u8   d0, d7                         \n"
+      "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+      "bgt        1b                             \n"
+      : "+r"(src_raw),  // %0
+        "+r"(dst_y),    // %1
+        "+r"(width)     // %2
+      :
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
 }
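
The six row converters above share one fixed-point luma recipe; only the
channel order and the coefficient registers change. A minimal scalar
sketch of the math the NEON loops vectorize, using the coefficients from
the code comments (the function name and loop are illustrative, not part
of the patch):

    #include <stdint.h>

    // Y = 0.2578*R + 0.5078*G + 0.1016*B + 16, in 1/128 units:
    // vmull/vmlal accumulate, vqrshrun #7 is a rounding shift,
    // and vqadd.u8 adds the +16 offset with saturation.
    static void rgb24_to_y_scalar(const uint8_t* src_rgb24,
                                  uint8_t* dst_y, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        int b = src_rgb24[0], g = src_rgb24[1], r = src_rgb24[2];
        int y = ((13 * b + 65 * g + 33 * r + 64) >> 7) + 16;
        dst_y[x] = (uint8_t)(y > 255 ? 255 : y);
        src_rgb24 += 3;
      }
    }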
 
 // Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8* dst_ptr,
-                         const uint8* src_ptr, ptrdiff_t src_stride,
-                         int dst_width, int source_y_fraction) {
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
+                         int source_y_fraction) {
   int y1_fraction = source_y_fraction;
-  asm volatile (
-    "cmp        %4, #0                         \n"
-    "beq        100f                           \n"
-    "add        %2, %1                         \n"
-    "cmp        %4, #128                       \n"
-    "beq        50f                            \n"
+  asm volatile(
+      "cmp        %4, #0                         \n"
+      "beq        100f                           \n"
+      "add        %2, %1                         \n"
+      "cmp        %4, #128                       \n"
+      "beq        50f                            \n"
 
-    "vdup.8     d5, %4                         \n"
-    "rsb        %4, #256                       \n"
-    "vdup.8     d4, %4                         \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vmull.u8   q13, d0, d4                    \n"
-    "vmull.u8   q14, d1, d4                    \n"
-    "vmlal.u8   q13, d2, d5                    \n"
-    "vmlal.u8   q14, d3, d5                    \n"
-    "vrshrn.u16 d0, q13, #8                    \n"
-    "vrshrn.u16 d1, q14, #8                    \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        1b                             \n"
-    "b          99f                            \n"
+      "vdup.8     d5, %4                         \n"
+      "rsb        %4, #256                       \n"
+      "vdup.8     d4, %4                         \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "vld1.8     {q0}, [%1]!                    \n"
+      "vld1.8     {q1}, [%2]!                    \n"
+      "subs       %3, %3, #16                    \n"
+      "vmull.u8   q13, d0, d4                    \n"
+      "vmull.u8   q14, d1, d4                    \n"
+      "vmlal.u8   q13, d2, d5                    \n"
+      "vmlal.u8   q14, d3, d5                    \n"
+      "vrshrn.u16 d0, q13, #8                    \n"
+      "vrshrn.u16 d1, q14, #8                    \n"
+      "vst1.8     {q0}, [%0]!                    \n"
+      "bgt        1b                             \n"
+      "b          99f                            \n"
 
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    MEMACCESS(2)
-    "vld1.8     {q1}, [%2]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    "vrhadd.u8  q0, q1                         \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        50b                            \n"
-    "b          99f                            \n"
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "vld1.8     {q0}, [%1]!                    \n"
+      "vld1.8     {q1}, [%2]!                    \n"
+      "subs       %3, %3, #16                    \n"
+      "vrhadd.u8  q0, q1                         \n"
+      "vst1.8     {q0}, [%0]!                    \n"
+      "bgt        50b                            \n"
+      "b          99f                            \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "vld1.8     {q0}, [%1]!                    \n"
-    "subs       %3, %3, #16                    \n"
-    MEMACCESS(0)
-    "vst1.8     {q0}, [%0]!                    \n"
-    "bgt        100b                           \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "vld1.8     {q0}, [%1]!                    \n"
+      "subs       %3, %3, #16                    \n"
+      "vst1.8     {q0}, [%0]!                    \n"
+      "bgt        100b                           \n"
 
-  "99:                                         \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(y1_fraction)       // %4
-  :
-  : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
-  );
+      "99:                                       \n"
+      : "+r"(dst_ptr),     // %0
+        "+r"(src_ptr),     // %1
+        "+r"(src_stride),  // %2
+        "+r"(dst_width),   // %3
+        "+r"(y1_fraction)  // %4
+      :
+      : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
 }
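
InterpolateRow_NEON blends two rows with a 0..255 weight, with fast
paths when the weight is 0 (plain copy, label 100) or 128 (rounding
average, label 50). A scalar sketch of the three branches (the helper
name is illustrative, not part of the patch):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void interpolate_row_scalar(uint8_t* dst, const uint8_t* src,
                                       ptrdiff_t stride, int width, int f) {
      const uint8_t* src1 = src + stride;
      int x;
      if (f == 0) {                  // 100/0: copy the first row.
        memcpy(dst, src, (size_t)width);
        return;
      }
      for (x = 0; x < width; ++x) {
        dst[x] = (f == 128)          // 50/50: vrhadd rounding average.
            ? (uint8_t)((src[x] + src1[x] + 1) >> 1)
            : (uint8_t)((src[x] * (256 - f) + src1[x] * f + 128) >> 8);
      }
    }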
 
 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  asm volatile (
-    "subs       %3, #8                         \n"
-    "blt        89f                            \n"
-    // Blend 8 pixels.
-  "8:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q10, d4, d3                    \n"  // db * a
-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
-    "vmov.u8    d3, #255                       \n"  // a = 255
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
-    "bge        8b                             \n"
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
+  asm volatile(
+      "subs       %3, #8                         \n"
+      "blt        89f                            \n"
+      // Blend 8 pixels.
+      "8:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0.
+      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1.
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q10, d4, d3                    \n"  // db * a
+      "vmull.u8   q11, d5, d3                    \n"  // dg * a
+      "vmull.u8   q12, d6, d3                    \n"  // dr * a
+      "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
+      "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
+      "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
+      "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
+      "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
+      "vqadd.u8   q0, q0, q2                     \n"  // + sbg
+      "vqadd.u8   d2, d2, d6                     \n"  // + sr
+      "vmov.u8    d3, #255                       \n"  // a = 255
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB.
+      "bge        8b                             \n"
 
-  "89:                                         \n"
-    "adds       %3, #8-1                       \n"
-    "blt        99f                            \n"
+      "89:                                       \n"
+      "adds       %3, #8-1                       \n"
+      "blt        99f                            \n"
 
-    // Blend 1 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
-    MEMACCESS(1)
-    "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
-    "subs       %3, %3, #1                     \n"  // 1 processed per loop.
-    "vmull.u8   q10, d4, d3                    \n"  // db * a
-    "vmull.u8   q11, d5, d3                    \n"  // dg * a
-    "vmull.u8   q12, d6, d3                    \n"  // dr * a
-    "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8
-    "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8
-    "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8
-    "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256
-    "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256
-    "vqadd.u8   q0, q0, q2                     \n"  // + sbg
-    "vqadd.u8   d2, d2, d6                     \n"  // + sr
-    "vmov.u8    d3, #255                       \n"  // a = 255
-    MEMACCESS(2)
-    "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
-    "bge        1b                             \n"
+      // Blend 1 pixel.
+      "1:                                        \n"
+      "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0.
+      "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1.
+      "subs       %3, %3, #1                     \n"    // 1 processed per loop.
+      "vmull.u8   q10, d4, d3                    \n"    // db * a
+      "vmull.u8   q11, d5, d3                    \n"    // dg * a
+      "vmull.u8   q12, d6, d3                    \n"    // dr * a
+      "vqrshrn.u16 d20, q10, #8                  \n"    // db >>= 8
+      "vqrshrn.u16 d21, q11, #8                  \n"    // dg >>= 8
+      "vqrshrn.u16 d22, q12, #8                  \n"    // dr >>= 8
+      "vqsub.u8   q2, q2, q10                    \n"    // dbg - dbg * a / 256
+      "vqsub.u8   d6, d6, d22                    \n"    // dr - dr * a / 256
+      "vqadd.u8   q0, q0, q2                     \n"    // + sbg
+      "vqadd.u8   d2, d2, d6                     \n"    // + sr
+      "vmov.u8    d3, #255                       \n"    // a = 255
+      "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel.
+      "bge        1b                             \n"
 
-  "99:                                         \n"
+      "99:                                         \n"
 
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
-  );
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12");
 }
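
The comment above the function is the whole algorithm: per color
channel, out = dst - dst * src_alpha / 256 + src, each step saturating,
with the result alpha forced to 255. A scalar sketch of one channel
(helper name illustrative, not part of the patch):

    #include <stdint.h>

    static uint8_t blend_component(uint8_t s, uint8_t d, uint8_t sa) {
      int shaded = (d * sa + 128) >> 8;  // vmull.u8 + vqrshrn.u16 #8
      int out = d - shaded;              // vqsub.u8 floors at 0
      if (out < 0) out = 0;
      out += s;                          // vqadd.u8 caps at 255
      return (uint8_t)(out > 255 ? 255 : out);
    }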
 
 // Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    // Attenuate 8 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q10, d0, d3                    \n"  // b * a
-    "vmull.u8   q11, d1, d3                    \n"  // g * a
-    "vmull.u8   q12, d2, d3                    \n"  // r * a
-    "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
-    "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
-    "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
-  );
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width) {
+  asm volatile(
+      // Attenuate 8 pixels.
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q10, d0, d3                    \n"  // b * a
+      "vmull.u8   q11, d1, d3                    \n"  // g * a
+      "vmull.u8   q12, d2, d3                    \n"  // r * a
+      "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8
+      "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8
+      "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8
+      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q10", "q11", "q12");
 }
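
Attenuation multiplies each color channel by the pixel's own alpha and
leaves alpha as-is (d3 is stored back untouched). A scalar equivalent of
one iteration (helper name illustrative):

    #include <stdint.h>

    static void attenuate_pixel(const uint8_t src[4], uint8_t dst[4]) {
      int a = src[3];
      dst[0] = (uint8_t)((src[0] * a + 128) >> 8);  // vmull + vqrshrn #8
      dst[1] = (uint8_t)((src[1] * a + 128) >> 8);
      dst[2] = (uint8_t)((src[2] * a + 128) >> 8);
      dst[3] = (uint8_t)a;
    }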
 
 // Quantize 8 ARGB pixels (32 bytes).
 // dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "vdup.u16   q8, %2                         \n"
-    "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
-    "vdup.u16   q9, %3                         \n"  // interval multiply.
-    "vdup.u16   q10, %4                        \n"  // interval add
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
+  asm volatile(
+      "vdup.u16   q8, %2                         \n"
+      "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1
+      "vdup.u16   q9, %3                         \n"  // interval multiply.
+      "vdup.u16   q10, %4                        \n"  // interval add
 
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
-    "vmovl.u8   q1, d2                         \n"
-    "vmovl.u8   q2, d4                         \n"
-    "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
-    "vqdmulh.s16 q1, q1, q8                    \n"  // g
-    "vqdmulh.s16 q2, q2, q8                    \n"  // r
-    "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
-    "vmul.u16   q1, q1, q9                     \n"  // g
-    "vmul.u16   q2, q2, q9                     \n"  // r
-    "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
-    "vadd.u16   q1, q1, q10                    \n"  // g
-    "vadd.u16   q2, q2, q10                    \n"  // r
-    "vqmovn.u16 d0, q0                         \n"
-    "vqmovn.u16 d2, q1                         \n"
-    "vqmovn.u16 d4, q2                         \n"
-    MEMACCESS(0)
-    "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
-  );
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 pixels of ARGB.
+      "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+      "vmovl.u8   q0, d0                         \n"  // b (0 .. 255)
+      "vmovl.u8   q1, d2                         \n"
+      "vmovl.u8   q2, d4                         \n"
+      "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale
+      "vqdmulh.s16 q1, q1, q8                    \n"  // g
+      "vqdmulh.s16 q2, q2, q8                    \n"  // r
+      "vmul.u16   q0, q0, q9                     \n"  // b * interval_size
+      "vmul.u16   q1, q1, q9                     \n"  // g
+      "vmul.u16   q2, q2, q9                     \n"  // r
+      "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset
+      "vadd.u16   q1, q1, q10                    \n"  // g
+      "vadd.u16   q2, q2, q10                    \n"  // r
+      "vqmovn.u16 d0, q0                         \n"
+      "vqmovn.u16 d2, q1                         \n"
+      "vqmovn.u16 d4, q2                         \n"
+      "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(dst_argb),       // %0
+        "+r"(width)           // %1
+      : "r"(scale),           // %2
+        "r"(interval_size),   // %3
+        "r"(interval_offset)  // %4
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10");
 }
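
The formula is the one stated above the function; the NEON subtlety is
that scale is halved up front so that vqdmulh, a doubling multiply that
returns the high half, yields (v * scale) >> 16. A scalar sketch (helper
name illustrative):

    #include <stdint.h>

    static void quantize_pixel(uint8_t* p, int scale, int interval_size,
                               int interval_offset) {
      int i;
      for (i = 0; i < 3; ++i) {  // b, g, r; alpha (d6) is stored unmodified
        p[i] = (uint8_t)(((p[i] * scale) >> 16) * interval_size +
                         interval_offset);
      }
    }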
 
 // Shade 8 pixels at a time by specified value.
 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
-    "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
-    "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value) {
+  asm volatile(
+      "vdup.u32   q0, %3                         \n"  // duplicate scale value.
+      "vzip.u8    d0, d1                         \n"  // d0 aarrggbb.
+      "vshr.u16   q0, q0, #1                     \n"  // scale / 2.
 
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
-    "vmovl.u8   q11, d22                       \n"
-    "vmovl.u8   q12, d24                       \n"
-    "vmovl.u8   q13, d26                       \n"
-    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
-    "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
-    "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
-    "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
-    "vqmovn.u16 d20, q10                       \n"
-    "vqmovn.u16 d22, q11                       \n"
-    "vqmovn.u16 d24, q12                       \n"
-    "vqmovn.u16 d26, q13                       \n"
-    MEMACCESS(1)
-    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),       // %0
-    "+r"(dst_argb),       // %1
-    "+r"(width)           // %2
-  : "r"(value)            // %3
-  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
-  );
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
+      "vmovl.u8   q11, d22                       \n"
+      "vmovl.u8   q12, d24                       \n"
+      "vmovl.u8   q13, d26                       \n"
+      "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2
+      "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g
+      "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r
+      "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a
+      "vqmovn.u16 d20, q10                       \n"
+      "vqmovn.u16 d22, q11                       \n"
+      "vqmovn.u16 d24, q12                       \n"
+      "vqmovn.u16 d26, q13                       \n"
+      "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(value)       // %3
+      : "cc", "memory", "q0", "q10", "q11", "q12", "q13");
 }
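
Shading multiplies each channel by the matching byte of the packed ARGB
value. The vdup/vzip/vshr setup builds 16-bit lanes holding
value * 0x101 / 2, so the vqrdmulh works out to roughly c * v / 255 with
rounding. A hedged scalar approximation (helper name illustrative):

    #include <stdint.h>

    // About (2 * c * (v * 0x101 / 2) + 0x8000) >> 16, i.e. roughly
    // c * v / 255, rounded.
    static uint8_t shade_component(uint8_t c, uint8_t v) {
      return (uint8_t)((c * v * 257 + 0x8000) >> 16);
    }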
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
 // Similar to ARGBToYJ but stores ARGB.
 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
-    "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
-    "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d24                    \n"  // B
-    "vmlal.u8   q2, d1, d25                    \n"  // G
-    "vmlal.u8   q2, d2, d26                    \n"  // R
-    "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
-    "vmov       d1, d0                         \n"  // G
-    "vmov       d2, d0                         \n"  // R
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
-  );
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient
+      "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient
+      "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q2, d0, d24                    \n"  // B
+      "vmlal.u8   q2, d1, d25                    \n"  // G
+      "vmlal.u8   q2, d2, d26                    \n"  // R
+      "vqrshrun.s16 d0, q2, #7                   \n"  // 15 bit to 8 bit B
+      "vmov       d1, d0                         \n"  // G
+      "vmov       d2, d0                         \n"  // R
+      "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q12", "q13");
 }
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
@@ -2482,189 +2279,175 @@
 //    b = (r * 35 + g * 68 + b * 17) >> 7
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d20, #17                       \n"  // BB coefficient
-    "vmov.u8    d21, #68                       \n"  // BG coefficient
-    "vmov.u8    d22, #35                       \n"  // BR coefficient
-    "vmov.u8    d24, #22                       \n"  // GB coefficient
-    "vmov.u8    d25, #88                       \n"  // GG coefficient
-    "vmov.u8    d26, #45                       \n"  // GR coefficient
-    "vmov.u8    d28, #24                       \n"  // BB coefficient
-    "vmov.u8    d29, #98                       \n"  // BG coefficient
-    "vmov.u8    d30, #50                       \n"  // BR coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
-    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
-    "vmlal.u8   q2, d1, d21                    \n"  // G
-    "vmlal.u8   q2, d2, d22                    \n"  // R
-    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
-    "vmlal.u8   q3, d1, d25                    \n"  // G
-    "vmlal.u8   q3, d2, d26                    \n"  // R
-    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
-    "vmlal.u8   q8, d1, d29                    \n"  // G
-    "vmlal.u8   q8, d2, d30                    \n"  // R
-    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
-    "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
-    "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
-    MEMACCESS(0)
-    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),  // %0
-    "+r"(width)      // %1
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3",
-    "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+  asm volatile(
+      "vmov.u8    d20, #17                       \n"  // BB coefficient
+      "vmov.u8    d21, #68                       \n"  // BG coefficient
+      "vmov.u8    d22, #35                       \n"  // BR coefficient
+      "vmov.u8    d24, #22                       \n"  // GB coefficient
+      "vmov.u8    d25, #88                       \n"  // GG coefficient
+      "vmov.u8    d26, #45                       \n"  // GR coefficient
+      "vmov.u8    d28, #24                       \n"  // BB coefficient
+      "vmov.u8    d29, #98                       \n"  // BG coefficient
+      "vmov.u8    d30, #50                       \n"  // BR coefficient
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
+      "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
+      "vmlal.u8   q2, d1, d21                    \n"  // G
+      "vmlal.u8   q2, d2, d22                    \n"  // R
+      "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
+      "vmlal.u8   q3, d1, d25                    \n"  // G
+      "vmlal.u8   q3, d2, d26                    \n"  // R
+      "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
+      "vmlal.u8   q8, d1, d29                    \n"  // G
+      "vmlal.u8   q8, d2, d30                    \n"  // R
+      "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
+      "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G
+      "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R
+      "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(dst_argb),  // %0
+        "+r"(width)      // %1
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13",
+        "q14", "q15");
 }
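
The sepia weights are exactly the ones in the comment block above;
vqshrn saturates, which matters for the G and R rows, whose weights sum
past 128. A scalar sketch (helper name illustrative):

    #include <stdint.h>

    static void sepia_pixel(uint8_t* p) {  // p[0..3] = B, G, R, A
      int b = p[0], g = p[1], r = p[2];
      int nb = (17 * b + 68 * g + 35 * r) >> 7;
      int ng = (22 * b + 88 * g + 45 * r) >> 7;
      int nr = (24 * b + 98 * g + 50 * r) >> 7;  // may exceed 255
      p[0] = (uint8_t)(nb > 255 ? 255 : nb);
      p[1] = (uint8_t)(ng > 255 ? 255 : ng);
      p[2] = (uint8_t)(nr > 255 ? 255 : nr);     // alpha (d3) untouched
    }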
 
 // Transform 8 ARGB pixels (32 bytes) with color matrix.
 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
 // needs to saturate.  Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
-    "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
-    "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             const int8_t* matrix_argb,
+                             int width) {
+  asm volatile(
+      "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors.
+      "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16.
+      "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16.
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
-    "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
-    "vmovl.u8   q9, d18                        \n"  // g
-    "vmovl.u8   q10, d20                       \n"  // r
-    "vmovl.u8   q11, d22                       \n"  // a
-    "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
-    "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
-    "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
-    "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
-    "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
-    "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
-    "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
-    "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
-    "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
-    "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
-    "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
-    "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
-    "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
-    "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
-    "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
-    "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
-    "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
-    "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
-    "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
-    "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
-    "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
-    "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
-    MEMACCESS(1)
-    "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "r"(matrix_argb)  // %3
-  : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
-    "q10", "q11", "q12", "q13", "q14", "q15"
-  );
+      "1:                                        \n"
+      "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels.
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+      "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit
+      "vmovl.u8   q9, d18                        \n"  // g
+      "vmovl.u8   q10, d20                       \n"  // r
+      "vmovl.u8   q11, d22                       \n"  // a
+      "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B
+      "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G
+      "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R
+      "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A
+      "vmul.s16   q4, q9, d0[1]                  \n"  // B += G * Matrix B
+      "vmul.s16   q5, q9, d1[1]                  \n"  // G += G * Matrix G
+      "vmul.s16   q6, q9, d2[1]                  \n"  // R += G * Matrix R
+      "vmul.s16   q7, q9, d3[1]                  \n"  // A += G * Matrix A
+      "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+      "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+      "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+      "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+      "vmul.s16   q4, q10, d0[2]                 \n"  // B += R * Matrix B
+      "vmul.s16   q5, q10, d1[2]                 \n"  // G += R * Matrix G
+      "vmul.s16   q6, q10, d2[2]                 \n"  // R += R * Matrix R
+      "vmul.s16   q7, q10, d3[2]                 \n"  // A += R * Matrix A
+      "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+      "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+      "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+      "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+      "vmul.s16   q4, q11, d0[3]                 \n"  // B += A * Matrix B
+      "vmul.s16   q5, q11, d1[3]                 \n"  // G += A * Matrix G
+      "vmul.s16   q6, q11, d2[3]                 \n"  // R += A * Matrix R
+      "vmul.s16   q7, q11, d3[3]                 \n"  // A += A * Matrix A
+      "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B
+      "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G
+      "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R
+      "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A
+      "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B
+      "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G
+      "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R
+      "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A
+      "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      : "r"(matrix_argb)  // %3
+      : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9",
+        "q10", "q11", "q12", "q13", "q14", "q15");
 }
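
matrix_argb is 16 signed bytes, one row of four coefficients per output
channel with the B row first; each output is a dot product of the input
B,G,R,A with its row, shifted right by 6 and clamped to 0..255. A scalar
sketch that glosses over the intermediate 16-bit saturation of the
vqadd.s16 steps (helper name illustrative):

    #include <stdint.h>

    static void color_matrix_pixel(const uint8_t in[4], uint8_t out[4],
                                   const int8_t m[16]) {
      int c;
      for (c = 0; c < 4; ++c) {  // output order: B, G, R, A
        int v = in[0] * m[4 * c + 0] + in[1] * m[4 * c + 1] +
                in[2] * m[4 * c + 2] + in[3] * m[4 * c + 3];
        v = v >> 6;              // vqshrun.s16 #6
        out[c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
      }
    }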
 
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vmull.u8   q0, d0, d1                     \n"  // multiply B
-    "vmull.u8   q1, d2, d3                     \n"  // multiply G
-    "vmull.u8   q2, d4, d5                     \n"  // multiply R
-    "vmull.u8   q3, d6, d7                     \n"  // multiply A
-    "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
-    "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
-    "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
-    "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vmull.u8   q0, d0, d1                     \n"  // multiply B
+      "vmull.u8   q1, d2, d3                     \n"  // multiply G
+      "vmull.u8   q2, d4, d5                     \n"  // multiply R
+      "vmull.u8   q3, d6, d7                     \n"  // multiply A
+      "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B
+      "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G
+      "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R
+      "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
 }
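
Multiply is a rounded per-channel product. Because vrshrn divides by 256
rather than 255, white times white comes back as 254. Scalar equivalent
(helper name illustrative):

    #include <stdint.h>

    static uint8_t multiply_component(uint8_t a, uint8_t b) {
      return (uint8_t)((a * b + 128) >> 8);  // vmull.u8 + vrshrn.u16 #8
    }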
 
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   q0, q0, q2                     \n"  // add B, G
-    "vqadd.u8   q1, q1, q3                     \n"  // add R, A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vqadd.u8   q0, q0, q2                     \n"  // add B, G
+      "vqadd.u8   q1, q1, q3                     \n"  // add R, A
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
 }
 
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
-    "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "q0", "q1", "q2", "q3"
-  );
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 more ARGB
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vqsub.u8   q0, q0, q2                     \n"  // subtract B, G
+      "vqsub.u8   q1, q1, q3                     \n"  // subtract R, A
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
 }
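
Add and Subtract reduce to per-byte saturating arithmetic; the q
registers simply carry B,G and R,A two channels at a time. Scalar
equivalents (helper names illustrative):

    #include <stdint.h>

    static uint8_t add_sat(uint8_t a, uint8_t b) {  // vqadd.u8
      int v = a + b;
      return (uint8_t)(v > 255 ? 255 : v);
    }

    static uint8_t sub_sat(uint8_t a, uint8_t b) {  // vqsub.u8
      return (uint8_t)(a > b ? a - b : 0);
    }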
 
 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
@@ -2672,54 +2455,50 @@
 // R = Sobel
 // G = Sobel
 // B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d0, d0, d1                     \n"  // add
-    "vmov.u8    d1, d0                         \n"
-    "vmov.u8    d2, d0                         \n"
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
+void SobelRow_NEON(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width) {
+  asm volatile(
+      "vmov.u8    d3, #255                       \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld1.8     {d0}, [%0]!                    \n"  // load 8 sobelx.
+      "vld1.8     {d1}, [%1]!                    \n"  // load 8 sobely.
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vqadd.u8   d0, d0, d1                     \n"  // add
+      "vmov.u8    d1, d0                         \n"
+      "vmov.u8    d2, d0                         \n"
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "q0", "q1");
 }
 
 // Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    // 16 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
-    "vqadd.u8   q0, q0, q1                     \n"  // add
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width) {
+  asm volatile(
+      // 16 pixel loop.
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load 16 sobelx.
+      "vld1.8     {q1}, [%1]!                    \n"  // load 16 sobely.
+      "subs       %3, %3, #16                    \n"  // 16 processed per loop.
+      "vqadd.u8   q0, q0, q1                     \n"  // add
+      "vst1.8     {q0}, [%2]!                    \n"  // store 16 pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_y),       // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "q0", "q1");
 }
 
 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
@@ -2727,28 +2506,26 @@
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "vmov.u8    d3, #255                       \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vqadd.u8   d1, d0, d2                     \n"  // add
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
-    "bgt        1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "q0", "q1"
-  );
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      "vmov.u8    d3, #255                       \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "vld1.8     {d2}, [%0]!                    \n"  // load 8 sobelx.
+      "vld1.8     {d0}, [%1]!                    \n"  // load 8 sobely.
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vqadd.u8   d1, d0, d2                     \n"  // add
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels.
+      "bgt        1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "q0", "q1");
 }
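
SobelXYRow_NEON differs from SobelRow_NEON only in the channel mapping;
a scalar sketch (again illustrative, not the shipped C fallback):

    #include <stdint.h>

    void SobelXYRow_sketch(const uint8_t* src_sobelx, const uint8_t* src_sobely,
                           uint8_t* dst_argb, int width) {
      for (int i = 0; i < width; ++i) {
        int s = src_sobelx[i] + src_sobely[i];
        dst_argb[4 * i + 0] = src_sobely[i];                 // B = Sobel Y
        dst_argb[4 * i + 1] = (uint8_t)(s > 255 ? 255 : s);  // G = Sobel
        dst_argb[4 * i + 2] = src_sobelx[i];                 // R = Sobel X
        dst_argb[4 * i + 3] = 255;                           // A
      }
    }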
 
 // SobelX as a matrix is
@@ -2755,43 +2532,39 @@
 // -1  0  1
 // -2  0  2
 // -1  0  1
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%5                  \n"  // top
-    MEMACCESS(0)
-    "vld1.8     {d1}, [%0],%6                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
-    MEMACCESS(1)
-    "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%6                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    MEMACCESS(2)
-    "vld1.8     {d2}, [%2],%5                  \n"  // bottom
-    MEMACCESS(2)
-    "vld1.8     {d3}, [%2],%6                  \n"
-    "subs       %4, %4, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
-    MEMACCESS(3)
-    "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
-    "bgt        1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  : "r"(2),            // %5
-    "r"(6)             // %6
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+void SobelXRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {d0}, [%0],%5                  \n"  // top
+      "vld1.8     {d1}, [%0],%6                  \n"
+      "vsubl.u8   q0, d0, d1                     \n"
+      "vld1.8     {d2}, [%1],%5                  \n"  // center * 2
+      "vld1.8     {d3}, [%1],%6                  \n"
+      "vsubl.u8   q1, d2, d3                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vld1.8     {d2}, [%2],%5                  \n"  // bottom
+      "vld1.8     {d3}, [%2],%6                  \n"
+      "subs       %4, %4, #8                     \n"  // 8 pixels
+      "vsubl.u8   q1, d2, d3                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vabs.s16   q0, q0                         \n"
+      "vqmovn.u16 d0, q0                         \n"
+      "vst1.8     {d0}, [%3]!                    \n"  // store 8 sobelx
+      "bgt        1b                             \n"
+      : "+r"(src_y0),               // %0
+        "+r"(src_y1),               // %1
+        "+r"(src_y2),               // %2
+        "+r"(dst_sobelx),           // %3
+        "+r"(width)                 // %4
+      : "r"(2),                     // %5
+        "r"(6)                      // %6
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
 }
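
The constant operands %5 = 2 and %6 = 6 step each row pointer to column
x+2 and then on to the next 8-pixel block (2 + 6 = 8), so the asm takes
column differences two pixels apart in the rows above, at and below the
output row. A scalar sketch of the same math (edge handling is the
caller's concern, as in the asm):

    #include <stdint.h>

    void SobelXRow_sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                          const uint8_t* src_y2, uint8_t* dst_sobelx,
                          int width) {
      for (int i = 0; i < width; ++i) {
        int a = src_y0[i] - src_y0[i + 2];  // top row
        int b = src_y1[i] - src_y1[i + 2];  // center row, weighted x2
        int c = src_y2[i] - src_y2[i + 2];  // bottom row
        int s = a + b + b + c;
        if (s < 0) s = -s;                  // vabs.s16
        dst_sobelx[i] = (uint8_t)(s > 255 ? 255 : s);  // vqmovn.u16
      }
    }

The sign is flipped relative to the -1 0 1 matrix (the code computes
col[x] - col[x+2]), which the absolute value makes irrelevant.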
 
 // SobelY as a matrix is
@@ -2798,44 +2571,121 @@
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0],%4                  \n"  // left
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1],%4                  \n"
-    "vsubl.u8   q0, d0, d1                     \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%4                  \n"
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0],%5                  \n"  // right
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1],%5                  \n"
-    "subs       %3, %3, #8                     \n"  // 8 pixels
-    "vsubl.u8   q1, d2, d3                     \n"
-    "vadd.s16   q0, q0, q1                     \n"
-    "vabs.s16   q0, q0                         \n"
-    "vqmovn.u16 d0, q0                         \n"
-    MEMACCESS(2)
-    "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
-    "bgt        1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  : "r"(1),            // %4
-    "r"(6)             // %5
-  : "cc", "memory", "q0", "q1"  // Clobber List
-  );
+void SobelYRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {d0}, [%0],%4                  \n"  // left
+      "vld1.8     {d1}, [%1],%4                  \n"
+      "vsubl.u8   q0, d0, d1                     \n"
+      "vld1.8     {d2}, [%0],%4                  \n"  // center * 2
+      "vld1.8     {d3}, [%1],%4                  \n"
+      "vsubl.u8   q1, d2, d3                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vld1.8     {d2}, [%0],%5                  \n"  // right
+      "vld1.8     {d3}, [%1],%5                  \n"
+      "subs       %3, %3, #8                     \n"  // 8 pixels
+      "vsubl.u8   q1, d2, d3                     \n"
+      "vadd.s16   q0, q0, q1                     \n"
+      "vabs.s16   q0, q0                         \n"
+      "vqmovn.u16 d0, q0                         \n"
+      "vst1.8     {d0}, [%2]!                    \n"  // store 8 sobely
+      "bgt        1b                             \n"
+      : "+r"(src_y0),               // %0
+        "+r"(src_y1),               // %1
+        "+r"(dst_sobely),           // %2
+        "+r"(width)                 // %3
+      : "r"(1),                     // %4
+        "r"(6)                      // %5
+      : "cc", "memory", "q0", "q1"  // Clobber List
+      );
 }
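
Here %4 = 1 and %5 = 6 walk each row through columns x, x+1 and x+2
(1 + 1 + 6 = 8 per iteration), with src_y0 and src_y1 the rows above
and below. A scalar sketch of the same math:

    #include <stdint.h>

    void SobelYRow_sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                          uint8_t* dst_sobely, int width) {
      for (int i = 0; i < width; ++i) {
        int a = src_y0[i] - src_y1[i];          // left column
        int b = src_y0[i + 1] - src_y1[i + 1];  // center column, weighted x2
        int c = src_y0[i + 2] - src_y1[i + 2];  // right column
        int s = a + b + b + c;
        if (s < 0) s = -s;
        dst_sobely[i] = (uint8_t)(s > 255 ? 255 : s);
      }
    }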
-#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)
+
+// %y passes a float as a scalar vector for vector * scalar multiply.
+// The register must be d0 to d15 and indexed with [0] or [1] to access
+// the float in the first or second half of the d register.
+
+void HalfFloat1Row_NEON(const uint16_t* src,
+                        uint16_t* dst,
+                        float /*unused*/,
+                        int width) {
+  asm volatile(
+
+      "1:                                        \n"
+      "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
+      "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+      "vmovl.u16  q2, d2                         \n"  // 8 int's
+      "vmovl.u16  q3, d3                         \n"
+      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
+      "vcvt.f32.u32  q3, q3                      \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // adjust exponent
+      "vmul.f32   q3, q3, %y3                    \n"
+      "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
+      "vqshrn.u32 d3, q3, #13                    \n"
+      "vst1.8     {q1}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src),              // %0
+        "+r"(dst),              // %1
+        "+r"(width)             // %2
+      : "w"(1.9259299444e-34f)  // %3
+      : "cc", "memory", "q1", "q2", "q3");
+}
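
The magic constant is 2^-112: multiplying by it rebases the float
exponent from single-precision bias 127 to half-precision bias 15
(127 - 15 = 112), after which dropping the low 13 mantissa bits
(23 - 10) leaves a valid IEEE half bit pattern. A scalar sketch of the
trick (truncating like the asm; vqshrn.u32 additionally saturates
out-of-range values, which this sketch omits):

    #include <stdint.h>
    #include <string.h>

    static uint16_t HalfFromU16_sketch(uint16_t v, float scale) {
      float f = (float)v * scale * 1.9259299444e-34f;  // scale * 2^-112
      uint32_t bits;
      memcpy(&bits, &f, sizeof(bits));  // reinterpret the float bits
      return (uint16_t)(bits >> 13);    // drop 23 - 10 = 13 mantissa bits
    }

HalfFloatRow_NEON below folds the caller's scale into the same constant,
so its inner loop is identical.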
+
+void HalfFloatRow_NEON(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  asm volatile(
+
+      "1:                                        \n"
+      "vld1.8     {q1}, [%0]!                    \n"  // load 8 shorts
+      "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+      "vmovl.u16  q2, d2                         \n"  // 8 int's
+      "vmovl.u16  q3, d3                         \n"
+      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
+      "vcvt.f32.u32  q3, q3                      \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // adjust exponent
+      "vmul.f32   q3, q3, %y3                    \n"
+      "vqshrn.u32 d2, q2, #13                    \n"  // isolate halffloat
+      "vqshrn.u32 d3, q3, #13                    \n"
+      "vst1.8     {q1}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src),                      // %0
+        "+r"(dst),                      // %1
+        "+r"(width)                     // %2
+      : "w"(scale * 1.9259299444e-34f)  // %3
+      : "cc", "memory", "q1", "q2", "q3");
+}
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width) {
+  asm volatile(
+
+      "1:                                        \n"
+      "vld1.8     {d2}, [%0]!                    \n"  // load 8 bytes
+      "subs       %2, %2, #8                     \n"  // 8 pixels per loop
+      "vmovl.u8   q1, d2                         \n"  // 8 shorts
+      "vmovl.u16  q2, d2                         \n"  // 8 ints
+      "vmovl.u16  q3, d3                         \n"
+      "vcvt.f32.u32  q2, q2                      \n"  // 8 floats
+      "vcvt.f32.u32  q3, q3                      \n"
+      "vmul.f32   q2, q2, %y3                    \n"  // scale
+      "vmul.f32   q3, q3, %y3                    \n"
+      "vst1.8     {q2, q3}, [%1]!                \n"  // store 8 floats
+      "bgt        1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "w"(scale)   // %3
+      : "cc", "memory", "q1", "q2", "q3");
+}
+
+#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
 
 #ifdef __cplusplus
 }  // extern "C"
--- a/third_party/libyuv/source/row_neon64.cc
+++ b/third_party/libyuv/source/row_neon64.cc
@@ -19,118 +19,103 @@
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 // Read 8 Y, 4 U and 4 V from 422
-#define READYUV422                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v1.s}[0], [%1], #4            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v1.s}[1], [%2], #4            \n"
+#define READYUV422                               \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v1.s}[0], [%1], #4            \n" \
+  "ld1        {v1.s}[1], [%2], #4            \n"
 
-// Read 8 Y, 2 U and 2 V from 422
-#define READYUV411                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.h}[0], [%1], #2            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v2.h}[1], [%2], #2            \n"                             \
-    "zip1       v1.8b, v2.8b, v2.8b            \n"
-
 // Read 8 Y, 8 U and 8 V from 444
-#define READYUV444                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v1.d}[0], [%1], #8            \n"                             \
-    MEMACCESS(2)                                                               \
-    "ld1        {v1.d}[1], [%2], #8            \n"                             \
-    "uaddlp     v1.8h, v1.16b                  \n"                             \
-    "rshrn      v1.8b, v1.8h, #1               \n"
+#define READYUV444                               \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v1.d}[0], [%1], #8            \n" \
+  "ld1        {v1.d}[1], [%2], #8            \n" \
+  "uaddlp     v1.8h, v1.16b                  \n" \
+  "rshrn      v1.8b, v1.8h, #1               \n"
 
 // Read 8 Y, and set 4 U and 4 V to 128
-#define READYUV400                                                             \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    "movi       v1.8b , #128                   \n"
+#define READYUV400                               \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "movi       v1.8b , #128                   \n"
 
 // Read 8 Y and 4 UV from NV12
-#define READNV12                                                               \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.8b}, [%1], #8              \n"                             \
-    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READNV12                                 \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v2.8b}, [%1], #8              \n" \
+  "uzp1       v1.8b, v2.8b, v2.8b            \n" \
+  "uzp2       v3.8b, v2.8b, v2.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 Y and 4 VU from NV21
-#define READNV21                                                               \
-    MEMACCESS(0)                                                               \
-    "ld1        {v0.8b}, [%0], #8              \n"                             \
-    MEMACCESS(1)                                                               \
-    "ld1        {v2.8b}, [%1], #8              \n"                             \
-    "uzp1       v3.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v1.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READNV21                                 \
+  "ld1        {v0.8b}, [%0], #8              \n" \
+  "ld1        {v2.8b}, [%1], #8              \n" \
+  "uzp1       v3.8b, v2.8b, v2.8b            \n" \
+  "uzp2       v1.8b, v2.8b, v2.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 YUY2
-#define READYUY2                                                               \
-    MEMACCESS(0)                                                               \
-    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"                             \
-    "uzp2       v3.8b, v1.8b, v1.8b            \n"                             \
-    "uzp1       v1.8b, v1.8b, v1.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READYUY2                                 \
+  "ld2        {v0.8b, v1.8b}, [%0], #16      \n" \
+  "uzp2       v3.8b, v1.8b, v1.8b            \n" \
+  "uzp1       v1.8b, v1.8b, v1.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
 // Read 8 UYVY
-#define READUYVY                                                               \
-    MEMACCESS(0)                                                               \
-    "ld2        {v2.8b, v3.8b}, [%0], #16      \n"                             \
-    "orr        v0.8b, v3.8b, v3.8b            \n"                             \
-    "uzp1       v1.8b, v2.8b, v2.8b            \n"                             \
-    "uzp2       v3.8b, v2.8b, v2.8b            \n"                             \
-    "ins        v1.s[1], v3.s[0]               \n"
+#define READUYVY                                 \
+  "ld2        {v2.8b, v3.8b}, [%0], #16      \n" \
+  "orr        v0.8b, v3.8b, v3.8b            \n" \
+  "uzp1       v1.8b, v2.8b, v2.8b            \n" \
+  "uzp2       v3.8b, v2.8b, v2.8b            \n" \
+  "ins        v1.s[1], v3.s[0]               \n"
 
-#define YUVTORGB_SETUP                                                         \
-    "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n"                             \
-    "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n"                             \
-    "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n"                             \
-    "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                             \
-    "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n"                             \
-    "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
+#define YUVTORGB_SETUP                           \
+  "ld1r       {v24.8h}, [%[kUVBiasBGR]], #2  \n" \
+  "ld1r       {v25.8h}, [%[kUVBiasBGR]], #2  \n" \
+  "ld1r       {v26.8h}, [%[kUVBiasBGR]]      \n" \
+  "ld1r       {v31.4s}, [%[kYToRgb]]         \n" \
+  "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
+  "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
 
-#define YUVTORGB(vR, vG, vB)                                                   \
-    "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */          \
-    "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */          \
-    "ushll2     v3.4s, v0.8h, #0               \n" /* Y */                     \
-    "ushll      v0.4s, v0.4h, #0               \n"                             \
-    "mul        v3.4s, v3.4s, v31.4s           \n"                             \
-    "mul        v0.4s, v0.4s, v31.4s           \n"                             \
-    "sqshrun    v0.4h, v0.4s, #16              \n"                             \
-    "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */                     \
-    "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */          \
-    "mov        v2.d[0], v1.d[1]               \n" /* Extract V */             \
-    "uxtl       v2.8h, v2.8b                   \n"                             \
-    "uxtl       v1.8h, v1.8b                   \n" /* Extract U */             \
-    "mul        v3.8h, v1.8h, v27.8h           \n"                             \
-    "mul        v5.8h, v1.8h, v29.8h           \n"                             \
-    "mul        v6.8h, v2.8h, v30.8h           \n"                             \
-    "mul        v7.8h, v2.8h, v28.8h           \n"                             \
-    "sqadd      v6.8h, v6.8h, v5.8h            \n"                             \
-    "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                     \
-    "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                     \
-    "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                     \
-    "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                     \
-    "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                     \
-    "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                     \
-    "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                     \
-    "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                     \
-    "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */                     \
+#define YUVTORGB(vR, vG, vB)                                        \
+  "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */ \
+  "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */ \
+  "ushll2     v3.4s, v0.8h, #0               \n" /* Y */            \
+  "ushll      v0.4s, v0.4h, #0               \n"                    \
+  "mul        v3.4s, v3.4s, v31.4s           \n"                    \
+  "mul        v0.4s, v0.4s, v31.4s           \n"                    \
+  "sqshrun    v0.4h, v0.4s, #16              \n"                    \
+  "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y */            \
+  "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */ \
+  "mov        v2.d[0], v1.d[1]               \n" /* Extract V */    \
+  "uxtl       v2.8h, v2.8b                   \n"                    \
+  "uxtl       v1.8h, v1.8b                   \n" /* Extract U */    \
+  "mul        v3.8h, v1.8h, v27.8h           \n"                    \
+  "mul        v5.8h, v1.8h, v29.8h           \n"                    \
+  "mul        v6.8h, v2.8h, v30.8h           \n"                    \
+  "mul        v7.8h, v2.8h, v28.8h           \n"                    \
+  "sqadd      v6.8h, v6.8h, v5.8h            \n"                    \
+  "sqadd      " #vB                                                 \
+  ".8h, v24.8h, v0.8h      \n" /* B */                              \
+  "sqadd      " #vG                                                 \
+  ".8h, v25.8h, v0.8h      \n" /* G */                              \
+  "sqadd      " #vR                                                 \
+  ".8h, v26.8h, v0.8h      \n" /* R */                              \
+  "sqadd      " #vB ".8h, " #vB                                     \
+  ".8h, v3.8h  \n" /* B */                                          \
+  "sqsub      " #vG ".8h, " #vG                                     \
+  ".8h, v6.8h  \n" /* G */                                          \
+  "sqadd      " #vR ".8h, " #vR                                     \
+  ".8h, v7.8h  \n" /* R */                                          \
+  "sqshrun    " #vB ".8b, " #vB                                     \
+  ".8h, #6     \n" /* B */                                          \
+  "sqshrun    " #vG ".8b, " #vG                                     \
+  ".8h, #6     \n"                               /* G */            \
+  "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */
 
-void I444ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I444ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -140,7 +125,6 @@
     READYUV444
     YUVTORGB(v22, v21, v20)
     "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -157,10 +141,10 @@
   );
 }
 
-void I422ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -170,7 +154,6 @@
     READYUV422
     YUVTORGB(v22, v21, v20)
     "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -187,11 +170,11 @@
   );
 }
 
-void I422AlphaToARGBRow_NEON(const uint8* src_y,
-                             const uint8* src_u,
-                             const uint8* src_v,
-                             const uint8* src_a,
-                             uint8* dst_argb,
+void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
+                             const uint8_t* src_u,
+                             const uint8_t* src_v,
+                             const uint8_t* src_a,
+                             uint8_t* dst_argb,
                              const struct YuvConstants* yuvconstants,
                              int width) {
   asm volatile (
@@ -199,10 +182,8 @@
   "1:                                          \n"
     READYUV422
     YUVTORGB(v22, v21, v20)
-    MEMACCESS(3)
     "ld1        {v23.8b}, [%3], #8             \n"
     "subs       %w5, %w5, #8                   \n"
-    MEMACCESS(4)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -220,50 +201,19 @@
   );
 }
 
-void I411ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_argb,
+void I422ToRGBARow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_rgba,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
     YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n" /* A */
-  "1:                                          \n"
-    READYUV411
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(src_u),     // %1
-      "+r"(src_v),     // %2
-      "+r"(dst_argb),  // %3
-      "+r"(width)      // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
-}
-
-void I422ToRGBARow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_rgba,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  asm volatile (
-    YUVTORGB_SETUP
     "movi       v20.8b, #255                   \n" /* A */
   "1:                                          \n"
     READYUV422
     YUVTORGB(v23, v22, v21)
     "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -280,10 +230,10 @@
   );
 }
 
-void I422ToRGB24Row_NEON(const uint8* src_y,
-                         const uint8* src_u,
-                         const uint8* src_v,
-                         uint8* dst_rgb24,
+void I422ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width) {
   asm volatile (
@@ -292,7 +242,6 @@
     READYUV422
     YUVTORGB(v22, v21, v20)
     "subs       %w4, %w4, #8                   \n"
-    MEMACCESS(3)
     "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -309,97 +258,91 @@
   );
 }
 
-#define ARGBTORGB565                                                           \
-    "shll       v0.8h,  v22.8b, #8             \n"  /* R                    */ \
-    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
-    "sri        v0.8h,  v21.8h, #5             \n"  /* RG                   */ \
-    "sri        v0.8h,  v20.8h, #11            \n"  /* RGB                  */
+#define ARGBTORGB565                                                        \
+  "shll       v0.8h,  v22.8b, #8             \n" /* R                    */ \
+  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
+  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
+  "sri        v0.8h,  v21.8h, #5             \n" /* RG                   */ \
+  "sri        v0.8h,  v20.8h, #11            \n" /* RGB                  */
 
-void I422ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_u,
-                          const uint8* src_v,
-                          uint8* dst_rgb565,
+void I422ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb565,
                           const struct YuvConstants* yuvconstants,
                           int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    ARGBTORGB565
-    MEMACCESS(3)
-    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_rgb565),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READYUV422 YUVTORGB(
+          v22, v21,
+          v20) "subs       %w4, %w4, #8                   \n" ARGBTORGB565
+               "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels
+                                                               // RGB565.
+               "b.gt       1b                             \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_u),       // %1
+        "+r"(src_v),       // %2
+        "+r"(dst_rgb565),  // %3
+        "+r"(width)        // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
 }
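
ARGBTORGB565 packs with shll plus sri ("shift right and insert"): each
channel is shifted to the top of a 16-bit lane, then G is inserted under
R's top 5 bits and B under the top 11. Per pixel this is the usual 565
truncation; a scalar sketch:

    #include <stdint.h>

    static uint16_t PackRGB565_sketch(uint8_t r, uint8_t g, uint8_t b) {
      // Keep the top 5 bits of R, 6 of G and 5 of B.
      return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    }

ARGBTOARGB1555 and ARGBTOARGB4444 below are the same idea with different
insert positions.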
 
-#define ARGBTOARGB1555                                                         \
-    "shll       v0.8h,  v23.8b, #8             \n"  /* A                    */ \
-    "shll       v22.8h, v22.8b, #8             \n"  /* R                    */ \
-    "shll       v21.8h, v21.8b, #8             \n"  /* G                    */ \
-    "shll       v20.8h, v20.8b, #8             \n"  /* B                    */ \
-    "sri        v0.8h,  v22.8h, #1             \n"  /* AR                   */ \
-    "sri        v0.8h,  v21.8h, #6             \n"  /* ARG                  */ \
-    "sri        v0.8h,  v20.8h, #11            \n"  /* ARGB                 */
+#define ARGBTOARGB1555                                                      \
+  "shll       v0.8h,  v23.8b, #8             \n" /* A                    */ \
+  "shll       v22.8h, v22.8b, #8             \n" /* R                    */ \
+  "shll       v21.8h, v21.8b, #8             \n" /* G                    */ \
+  "shll       v20.8h, v20.8b, #8             \n" /* B                    */ \
+  "sri        v0.8h,  v22.8h, #1             \n" /* AR                   */ \
+  "sri        v0.8h,  v21.8h, #6             \n" /* ARG                  */ \
+  "sri        v0.8h,  v20.8h, #11            \n" /* ARGB                 */
 
-void I422ToARGB1555Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb1555,
+void I422ToARGB1555Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb1555,
                             const struct YuvConstants* yuvconstants,
                             int width) {
-  asm volatile (
-    YUVTORGB_SETUP
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    READYUV422
-    YUVTORGB(v22, v21, v20)
-    "subs       %w4, %w4, #8                   \n"
-    ARGBTOARGB1555
-    MEMACCESS(3)
-    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-    : "+r"(src_y),    // %0
-      "+r"(src_u),    // %1
-      "+r"(src_v),    // %2
-      "+r"(dst_argb1555),  // %3
-      "+r"(width)     // %4
-    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
-      [kUVToG]"r"(&yuvconstants->kUVToG),
-      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
-      [kYToRgb]"r"(&yuvconstants->kYToRgb)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
-      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
-  );
+  asm volatile(
+      YUVTORGB_SETUP
+      "movi       v23.8b, #255                   \n"
+      "1:                                        \n" READYUV422 YUVTORGB(
+          v22, v21,
+          v20) "subs       %w4, %w4, #8                   \n" ARGBTOARGB1555
+               "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels
+                                                               // ARGB1555.
+               "b.gt       1b                             \n"
+      : "+r"(src_y),         // %0
+        "+r"(src_u),         // %1
+        "+r"(src_v),         // %2
+        "+r"(dst_argb1555),  // %3
+        "+r"(width)          // %4
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
 }
 
-#define ARGBTOARGB4444                                                         \
-    /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
-    "ushr       v20.8b, v20.8b, #4             \n"  /* B                    */ \
-    "bic        v21.8b, v21.8b, v4.8b          \n"  /* G                    */ \
-    "ushr       v22.8b, v22.8b, #4             \n"  /* R                    */ \
-    "bic        v23.8b, v23.8b, v4.8b          \n"  /* A                    */ \
-    "orr        v0.8b,  v20.8b, v21.8b         \n"  /* BG                   */ \
-    "orr        v1.8b,  v22.8b, v23.8b         \n"  /* RA                   */ \
-    "zip1       v0.16b, v0.16b, v1.16b         \n"  /* BGRA                 */
+#define ARGBTOARGB4444                                                       \
+  /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f        */ \
+  "ushr       v20.8b, v20.8b, #4             \n" /* B                    */  \
+  "bic        v21.8b, v21.8b, v4.8b          \n" /* G                    */  \
+  "ushr       v22.8b, v22.8b, #4             \n" /* R                    */  \
+  "bic        v23.8b, v23.8b, v4.8b          \n" /* A                    */  \
+  "orr        v0.8b,  v20.8b, v21.8b         \n" /* BG                   */  \
+  "orr        v1.8b,  v22.8b, v23.8b         \n" /* RA                   */  \
+  "zip1       v0.16b, v0.16b, v1.16b         \n" /* BGRA                 */
 
-void I422ToARGB4444Row_NEON(const uint8* src_y,
-                            const uint8* src_u,
-                            const uint8* src_v,
-                            uint8* dst_argb4444,
+void I422ToARGB4444Row_NEON(const uint8_t* src_y,
+                            const uint8_t* src_u,
+                            const uint8_t* src_v,
+                            uint8_t* dst_argb4444,
                             const struct YuvConstants* yuvconstants,
                             int width) {
   asm volatile (
@@ -411,7 +354,6 @@
     "subs       %w4, %w4, #8                   \n"
     "movi       v23.8b, #255                   \n"
     ARGBTOARGB4444
-    MEMACCESS(3)
     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
     "b.gt       1b                             \n"
     : "+r"(src_y),    // %0
@@ -428,9 +370,7 @@
   );
 }
 
-void I400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
+void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
   asm volatile (
     YUVTORGB_SETUP
     "movi       v23.8b, #255                   \n"
@@ -438,7 +378,6 @@
     READYUV400
     YUVTORGB(v22, v21, v20)
     "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -453,31 +392,26 @@
   );
 }
 
-void J400ToARGBRow_NEON(const uint8* src_y,
-                        uint8* dst_argb,
-                        int width) {
-  asm volatile (
-    "movi       v23.8b, #255                   \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v20.8b}, [%0], #8             \n"
-    "orr        v21.8b, v20.8b, v20.8b         \n"
-    "orr        v22.8b, v20.8b, v20.8b         \n"
-    "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
-    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
-    "b.gt       1b                             \n"
-    : "+r"(src_y),     // %0
-      "+r"(dst_argb),  // %1
-      "+r"(width)      // %2
-    :
-    : "cc", "memory", "v20", "v21", "v22", "v23"
-  );
+void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi       v23.8b, #255                   \n"
+      "1:                                        \n"
+      "ld1        {v20.8b}, [%0], #8             \n"
+      "orr        v21.8b, v20.8b, v20.8b         \n"
+      "orr        v22.8b, v20.8b, v20.8b         \n"
+      "subs       %w2, %w2, #8                   \n"
+      "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v20", "v21", "v22", "v23");
 }
 
-void NV12ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_uv,
-                        uint8* dst_argb,
+void NV12ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_uv,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -487,7 +421,6 @@
     READNV12
     YUVTORGB(v22, v21, v20)
     "subs       %w3, %w3, #8                   \n"
-    MEMACCESS(2)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -503,9 +436,9 @@
   );
 }
 
-void NV21ToARGBRow_NEON(const uint8* src_y,
-                        const uint8* src_vu,
-                        uint8* dst_argb,
+void NV21ToARGBRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_vu,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -515,7 +448,6 @@
     READNV21
     YUVTORGB(v22, v21, v20)
     "subs       %w3, %w3, #8                   \n"
-    MEMACCESS(2)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
@@ -531,11 +463,11 @@
   );
 }
 
-void NV12ToRGB565Row_NEON(const uint8* src_y,
-                          const uint8* src_uv,
-                          uint8* dst_rgb565,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
+void NV12ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_uv,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
   asm volatile (
     YUVTORGB_SETUP
   "1:                                          \n"
@@ -542,13 +474,11 @@
     READNV12
     YUVTORGB(v22, v21, v20)
     "subs       %w3, %w3, #8                   \n"
-    ARGBTORGB565
-    MEMACCESS(2)
-    "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
+    "st3        {v20.8b,v21.8b,v22.8b}, [%2], #24     \n"
     "b.gt       1b                             \n"
     : "+r"(src_y),     // %0
       "+r"(src_uv),    // %1
-      "+r"(dst_rgb565),  // %2
+      "+r"(dst_rgb24),  // %2
       "+r"(width)      // %3
     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
       [kUVToG]"r"(&yuvconstants->kUVToG),
@@ -559,8 +489,59 @@
   );
 }
 
-void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
-                        uint8* dst_argb,
+void NV21ToRGB24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_rgb24,
+                         const struct YuvConstants* yuvconstants,
+                         int width) {
+  asm volatile (
+    YUVTORGB_SETUP
+  "1:                                          \n"
+    READNV21
+    YUVTORGB(v22, v21, v20)
+    "subs       %w3, %w3, #8                   \n"
+    "st3        {v20.8b,v21.8b,v22.8b}, [%2], #24     \n"
+    "b.gt       1b                             \n"
+    : "+r"(src_y),     // %0
+      "+r"(src_vu),    // %1
+      "+r"(dst_rgb24),  // %2
+      "+r"(width)      // %3
+    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
+      [kUVToG]"r"(&yuvconstants->kUVToG),
+      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
+      [kYToRgb]"r"(&yuvconstants->kYToRgb)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+  );
+}
+
+void NV12ToRGB565Row_NEON(const uint8_t* src_y,
+                          const uint8_t* src_uv,
+                          uint8_t* dst_rgb565,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  asm volatile(
+      YUVTORGB_SETUP
+      "1:                                        \n" READNV12 YUVTORGB(
+          v22, v21,
+          v20) "subs       %w3, %w3, #8                   \n" ARGBTORGB565
+               "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels
+                                                               // RGB565.
+               "b.gt       1b                             \n"
+      : "+r"(src_y),       // %0
+        "+r"(src_uv),      // %1
+        "+r"(dst_rgb565),  // %2
+        "+r"(width)        // %3
+      : [kUVToRB] "r"(&yuvconstants->kUVToRB),
+        [kUVToG] "r"(&yuvconstants->kUVToG),
+        [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
+        [kYToRgb] "r"(&yuvconstants->kYToRgb)
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
+}
+
+void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -570,7 +551,6 @@
     READYUY2
     YUVTORGB(v22, v21, v20)
     "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
     "b.gt       1b                             \n"
     : "+r"(src_yuy2),  // %0
@@ -585,8 +565,8 @@
   );
 }
 
-void UYVYToARGBRow_NEON(const uint8* src_uyvy,
-                        uint8* dst_argb,
+void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
+                        uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
                         int width) {
   asm volatile (
@@ -596,7 +576,6 @@
     READUYVY
     YUVTORGB(v22, v21, v20)
     "subs       %w2, %w2, #8                   \n"
-    MEMACCESS(1)
     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
     "b.gt       1b                             \n"
     : "+r"(src_uyvy),  // %0
@@ -612,869 +591,819 @@
 }
 
// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
-void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void SplitUVRow_NEON(const uint8_t* src_uv,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
                      int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store U
-    MEMACCESS(2)
-    "st1        {v1.16b}, [%2], #16            \n"  // store V
-    "b.gt       1b                             \n"
-    : "+r"(src_uv),  // %0
-      "+r"(dst_u),   // %1
-      "+r"(dst_v),   // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "v0", "v1"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
+      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+      "st1        {v0.16b}, [%1], #16            \n"  // store U
+      "st1        {v1.16b}, [%2], #16            \n"  // store V
+      "b.gt       1b                             \n"
+      : "+r"(src_uv),               // %0
+        "+r"(dst_u),                // %1
+        "+r"(dst_v),                // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1"  // Clobber List
+      );
 }
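
ld2 deinterleaves for free on load. A scalar sketch of what
SplitUVRow_NEON computes, and which MergeUVRow_NEON below inverts:

    #include <stdint.h>

    void SplitUVRow_sketch(const uint8_t* src_uv, uint8_t* dst_u,
                           uint8_t* dst_v, int width) {
      for (int i = 0; i < width; ++i) {
        dst_u[i] = src_uv[2 * i + 0];  // even bytes to the U plane
        dst_v[i] = src_uv[2 * i + 1];  // odd bytes to the V plane
      }
    }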
 
 // Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+void MergeUVRow_NEON(const uint8_t* src_u,
+                     const uint8_t* src_v,
+                     uint8_t* dst_uv,
                      int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load U
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"  // load V
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    MEMACCESS(2)
-    "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
-    "b.gt       1b                             \n"
-    :
-      "+r"(src_u),   // %0
-      "+r"(src_v),   // %1
-      "+r"(dst_uv),  // %2
-      "+r"(width)    // %3  // Output registers
-    :                       // Input registers
-    : "cc", "memory", "v0", "v1"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load U
+      "ld1        {v1.16b}, [%1], #16            \n"  // load V
+      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+      "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
+      "b.gt       1b                             \n"
+      : "+r"(src_u),                // %0
+        "+r"(src_v),                // %1
+        "+r"(dst_uv),               // %2
+        "+r"(width)                 // %3  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1"  // Clobber List
+      );
 }
 
-// Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
-void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
-    "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
-    MEMACCESS(1)
-    "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(count)  // %2  // Output registers
-  :                     // Input registers
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+// Reads 16 packed RGB and writes to planar dst_r, dst_g, dst_b.
+void SplitRGBRow_NEON(const uint8_t* src_rgb,
+                      uint8_t* dst_r,
+                      uint8_t* dst_g,
+                      uint8_t* dst_b,
+                      int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 RGB
+      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop
+      "st1        {v0.16b}, [%1], #16            \n"  // store R
+      "st1        {v1.16b}, [%2], #16            \n"  // store G
+      "st1        {v2.16b}, [%3], #16            \n"  // store B
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb),                    // %0
+        "+r"(dst_r),                      // %1
+        "+r"(dst_g),                      // %2
+        "+r"(dst_b),                      // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+      );
 }
 
-// SetRow writes 'count' bytes using an 8 bit value repeated.
-void SetRow_NEON(uint8* dst, uint8 v8, int count) {
-  asm volatile (
-    "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
-  "1:                                          \n"
-    "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store
-    "b.gt       1b                             \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v8)      // %2
-  : "cc", "memory", "v0"
-  );
+// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time.
+void MergeRGBRow_NEON(const uint8_t* src_r,
+                      const uint8_t* src_g,
+                      const uint8_t* src_b,
+                      uint8_t* dst_rgb,
+                      int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load R
+      "ld1        {v1.16b}, [%1], #16            \n"  // load G
+      "ld1        {v2.16b}, [%2], #16            \n"  // load B
+      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop
+      "st3        {v0.16b,v1.16b,v2.16b}, [%3], #48 \n"  // store 16 RGB
+      "b.gt       1b                             \n"
+      : "+r"(src_r),                      // %0
+        "+r"(src_g),                      // %1
+        "+r"(src_b),                      // %2
+        "+r"(dst_rgb),                    // %3
+        "+r"(width)                       // %4
+      :                                   // Input registers
+      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+      );
 }
 
-void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
-  asm volatile (
-    "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
-  "1:                                          \n"
-    "subs       %w1, %w1, #4                   \n"  // 4 ints per loop
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store
-    "b.gt       1b                             \n"
-  : "+r"(dst),   // %0
-    "+r"(count)  // %1
-  : "r"(v32)     // %2
-  : "cc", "memory", "v0"
-  );
+// Copy multiple of 32.
+void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ldp        q0, q1, [%0], #32              \n"
+      "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
+      "stp        q0, q1, [%1], #32              \n"
+      "b.gt       1b                             \n"
+      : "+r"(src),                  // %0
+        "+r"(dst),                  // %1
+        "+r"(width)                 // %2  // Output registers
+      :                             // Input registers
+      : "cc", "memory", "v0", "v1"  // Clobber List
+      );
 }
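
The ldp/stp pair moves two q registers, 32 bytes, per iteration, and the subs/b.gt loop always copies a whole multiple of 32, so width must be a multiple of 32 as the comment says; remainders are handled by other paths. The contract in scalar form, as a sketch with a hypothetical name:

    #include <stdint.h>
    #include <string.h>

    /* Scalar sketch of CopyRow: a plain byte copy. The NEON version
     * assumes width is a multiple of 32. */
    static void CopyRow_sketch(const uint8_t* src, uint8_t* dst, int width) {
      memcpy(dst, src, (size_t)width);
    }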
 
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // Start at end of source row.
-    "add        %0, %0, %w2, sxtw              \n"
-    "sub        %0, %0, #16                    \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
-    "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
-    "rev64      v0.16b, v0.16b                 \n"
-    MEMACCESS(1)
-    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
-    MEMACCESS(1)
-    "st1        {v0.D}[0], [%1], #8            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  : "r"((ptrdiff_t)-16)    // %3
-  : "cc", "memory", "v0"
-  );
+// SetRow writes 'width' bytes using an 8 bit value repeated.
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
+  asm volatile(
+      "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
+      "1:                                        \n"
+      "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
+      "st1        {v0.16b}, [%0], #16            \n"  // store
+      "b.gt       1b                             \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v8)      // %2
+      : "cc", "memory", "v0");
 }
 
-void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
+  asm volatile(
+      "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
+      "1:                                        \n"
+      "subs       %w1, %w1, #4                   \n"  // 4 ints per loop
+      "st1        {v0.16b}, [%0], #16            \n"  // store
+      "b.gt       1b                             \n"
+      : "+r"(dst),   // %0
+        "+r"(width)  // %1
+      : "r"(v32)     // %2
+      : "cc", "memory", "v0");
+}
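
dup broadcasts the scalar into every lane, so SetRow stores 16 copies of v8 per iteration and ARGBSetRow four copies of the 32-bit pixel. Scalar sketches with hypothetical names, assuming the same width multiples as the NEON loops:

    #include <stdint.h>
    #include <string.h>

    static void SetRow_sketch(uint8_t* dst, uint8_t v8, int width) {
      memset(dst, v8, (size_t)width);  /* one byte value repeated */
    }

    static void ARGBSetRow_sketch(uint8_t* dst, uint32_t v32, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        memcpy(dst + 4 * x, &v32, 4);  /* one 4-byte ARGB pixel repeated */
      }
    }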
+
+void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "add        %0, %0, %w2, sxtw              \n"
+      "sub        %0, %0, #16                    \n"
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+      "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
+      "rev64      v0.16b, v0.16b                 \n"
+      "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
+      "st1        {v0.D}[0], [%1], #8            \n"
+      "b.gt       1b                             \n"
+      : "+r"(src),           // %0
+        "+r"(dst),           // %1
+        "+r"(width)          // %2
+      : "r"((ptrdiff_t)-16)  // %3
+      : "cc", "memory", "v0");
+}
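
The mirror loop walks the source backwards (the negative post-index passed in %3), byte-reverses each 8-byte half with rev64, then stores the halves swapped to finish the 16-byte reversal; ARGBMirrorRow below does the same at 4-byte granularity with rev64 v0.4s. Scalar sketch, hypothetical name:

    #include <stdint.h>

    /* Scalar sketch of MirrorRow: reverse the byte order of a row. */
    static void MirrorRow_sketch(const uint8_t* src, uint8_t* dst, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst[x] = src[width - 1 - x];
      }
    }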
+
+void MirrorUVRow_NEON(const uint8_t* src_uv,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
                       int width) {
-  asm volatile (
-    // Start at end of source row.
-    "add        %0, %0, %w3, sxtw #1           \n"
-    "sub        %0, %0, #16                    \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
-    "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
-    "rev64      v0.8b, v0.8b                   \n"
-    "rev64      v1.8b, v1.8b                   \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_uv),  // %0
-    "+r"(dst_u),   // %1
-    "+r"(dst_v),   // %2
-    "+r"(width)    // %3
-  : "r"((ptrdiff_t)-16)      // %4
-  : "cc", "memory", "v0", "v1"
-  );
+  asm volatile(
+      // Start at end of source row.
+      "add        %0, %0, %w3, sxtw #1           \n"
+      "sub        %0, %0, #16                    \n"
+      "1:                                        \n"
+      "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
+      "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
+      "rev64      v0.8b, v0.8b                   \n"
+      "rev64      v1.8b, v1.8b                   \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
+      "st1        {v1.8b}, [%2], #8              \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_uv),        // %0
+        "+r"(dst_u),         // %1
+        "+r"(dst_v),         // %2
+        "+r"(width)          // %3
+      : "r"((ptrdiff_t)-16)  // %4
+      : "cc", "memory", "v0", "v1");
 }
 
-void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-  // Start at end of source row.
-    "add        %0, %0, %w2, sxtw #2           \n"
-    "sub        %0, %0, #16                    \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
-    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
-    "rev64      v0.4s, v0.4s                   \n"
-    MEMACCESS(1)
-    "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
-    MEMACCESS(1)
-    "st1        {v0.D}[0], [%1], #8            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src),   // %0
-    "+r"(dst),   // %1
-    "+r"(width)  // %2
-  : "r"((ptrdiff_t)-16)    // %3
-  : "cc", "memory", "v0"
-  );
+void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "add        %0, %0, %w2, sxtw #2           \n"
+      "sub        %0, %0, #16                    \n"
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
+      "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
+      "rev64      v0.4s, v0.4s                   \n"
+      "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
+      "st1        {v0.D}[0], [%1], #8            \n"
+      "b.gt       1b                             \n"
+      : "+r"(src),           // %0
+        "+r"(dst),           // %1
+        "+r"(width)          // %2
+      : "r"((ptrdiff_t)-16)  // %3
+      : "cc", "memory", "v0");
 }
 
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v4.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
-  );
+void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
+                         uint8_t* dst_argb,
+                         int width) {
+  asm volatile(
+      "movi       v4.8b, #255                    \n"  // Alpha
+      "1:                                        \n"
+      "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+      );
 }
 
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v5.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
-    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
-    MEMACCESS(1)
-    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
-    "b.gt       1b                             \n"
-  : "+r"(src_raw),   // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
-  );
+void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi       v5.8b, #255                    \n"  // Alpha
+      "1:                                        \n"
+      "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
+      "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
+      "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
+      "b.gt       1b                             \n"
+      : "+r"(src_raw),   // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+      );
 }
 
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
-    "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
-    MEMACCESS(1)
-    "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
-    "b.gt       1b                             \n"
-  : "+r"(src_raw),    // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
-  );
+void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
+      "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
+      "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
+      "b.gt       1b                             \n"
+      : "+r"(src_raw),    // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+      );
 }
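
As the asm comments indicate, RAW is r,g,b in memory while RGB24 and ARGB keep b,g,r order, which is why the RAW loops shuffle channels through the orr moves while RGB24ToARGBRow only appends the constant alpha set up by movi. The RAW-to-ARGB case in scalar form, as a sketch with a hypothetical name:

    #include <stdint.h>

    /* Scalar sketch: RAW (r,g,b) to ARGB (b,g,r,a in memory). */
    static void RAWToARGBRow_sketch(const uint8_t* src_raw, uint8_t* dst_argb,
                                    int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_argb[4 * x + 0] = src_raw[3 * x + 2];  /* B */
        dst_argb[4 * x + 1] = src_raw[3 * x + 1];  /* G */
        dst_argb[4 * x + 2] = src_raw[3 * x + 0];  /* R */
        dst_argb[4 * x + 3] = 255u;                /* A = opaque */
      }
    }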
 
-#define RGB565TOARGB                                                           \
-    "shrn       v6.8b, v0.8h, #5               \n"  /* G xxGGGGGG           */ \
-    "shl        v6.8b, v6.8b, #2               \n"  /* G GGGGGG00 upper 6   */ \
-    "ushr       v4.8b, v6.8b, #6               \n"  /* G 000000GG lower 2   */ \
-    "orr        v1.8b, v4.8b, v6.8b            \n"  /* G                    */ \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "ushr       v0.8h, v0.8h, #11              \n"  /* R 000RRRRR           */ \
-    "xtn2       v2.16b,v0.8h                   \n"  /* R in upper part      */ \
-    "shl        v2.16b, v2.16b, #3             \n"  /* R,B BBBBB000 upper 5 */ \
-    "ushr       v0.16b, v2.16b, #5             \n"  /* R,B 00000BBB lower 3 */ \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* R,B                  */ \
-    "dup        v2.2D, v0.D[1]                 \n"  /* R                    */
+#define RGB565TOARGB                                                        \
+  "shrn       v6.8b, v0.8h, #5               \n" /* G xxGGGGGG           */ \
+  "shl        v6.8b, v6.8b, #2               \n" /* G GGGGGG00 upper 6   */ \
+  "ushr       v4.8b, v6.8b, #6               \n" /* G 000000GG lower 2   */ \
+  "orr        v1.8b, v4.8b, v6.8b            \n" /* G                    */ \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "ushr       v0.8h, v0.8h, #11              \n" /* R 000RRRRR           */ \
+  "xtn2       v2.16b,v0.8h                   \n" /* R in upper part      */ \
+  "shl        v2.16b, v2.16b, #3             \n" /* R,B BBBBB000 upper 5 */ \
+  "ushr       v0.16b, v2.16b, #5             \n" /* R,B 00000BBB lower 3 */ \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* R,B                  */ \
+  "dup        v2.2D, v0.D[1]                 \n" /* R                    */
 
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    RGB565TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
-  );
+void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      "movi       v3.8b, #255                    \n"  // Alpha
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_argb),    // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
+      );
 }
 
-#define ARGB1555TOARGB                                                         \
-    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
-    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
-    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000 AAAAAAAA    */ \
-                                                                               \
-    "sshr       v2.8h, v0.8h, #15              \n"  /* A AAAAAAAA           */ \
-    "xtn2       v3.16b, v2.8h                  \n"                             \
-                                                                               \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
-                                                                               \
-    "ushr       v1.16b, v3.16b, #5             \n"  /* R,A 00000RRR lower 3 */ \
-    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
-    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
-                                                                               \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
-    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R,A                  */ \
-    "dup        v1.2D, v0.D[1]                 \n"                             \
-    "dup        v3.2D, v2.D[1]                 \n"
+#define ARGB1555TOARGB                                                      \
+  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
+  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
+  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000 AAAAAAAA    */ \
+                                                                            \
+  "sshr       v2.8h, v0.8h, #15              \n" /* A AAAAAAAA           */ \
+  "xtn2       v3.16b, v2.8h                  \n"                            \
+                                                                            \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
+                                                                            \
+  "ushr       v1.16b, v3.16b, #5             \n" /* R,A 00000RRR lower 3 */ \
+  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
+  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
+                                                                            \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
+  "orr        v2.16b, v1.16b, v3.16b         \n" /* R,A                  */ \
+  "dup        v1.2D, v0.D[1]                 \n"                            \
+  "dup        v3.2D, v2.D[1]                 \n"
 
 // RGB555TOARGB is the same as ARGB1555TOARGB but ignores alpha.
-#define RGB555TOARGB                                                           \
-    "ushr       v2.8h, v0.8h, #10              \n"  /* R xxxRRRRR           */ \
-    "shl        v2.8h, v2.8h, #3               \n"  /* R RRRRR000 upper 5   */ \
-    "xtn        v3.8b, v2.8h                   \n"  /* RRRRR000             */ \
-                                                                               \
-    "xtn        v2.8b, v0.8h                   \n"  /* B xxxBBBBB           */ \
-    "shrn2      v2.16b,v0.8h, #5               \n"  /* G xxxGGGGG           */ \
-                                                                               \
-    "ushr       v1.16b, v3.16b, #5             \n"  /* R   00000RRR lower 3 */ \
-    "shl        v0.16b, v2.16b, #3             \n"  /* B,G BBBBB000 upper 5 */ \
-    "ushr       v2.16b, v0.16b, #5             \n"  /* B,G 00000BBB lower 3 */ \
-                                                                               \
-    "orr        v0.16b, v0.16b, v2.16b         \n"  /* B,G                  */ \
-    "orr        v2.16b, v1.16b, v3.16b         \n"  /* R                    */ \
-    "dup        v1.2D, v0.D[1]                 \n"  /* G */                    \
+#define RGB555TOARGB                                                        \
+  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
+  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
+  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000             */ \
+                                                                            \
+  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
+  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
+                                                                            \
+  "ushr       v1.16b, v3.16b, #5             \n" /* R   00000RRR lower 3 */ \
+  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
+  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
+                                                                            \
+  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
+  "orr        v2.16b, v1.16b, v3.16b         \n" /* R                    */ \
+  "dup        v1.2D, v0.D[1]                 \n" /* G */
 
-void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
+                            uint8_t* dst_argb,
                             int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // Alpha
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+  asm volatile(
+      "movi       v3.8b, #255                    \n"  // Alpha
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+                                                            // pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-#define ARGB4444TOARGB                                                         \
-    "shrn       v1.8b,  v0.8h, #8              \n"  /* v1(l) AR             */ \
-    "xtn2       v1.16b, v0.8h                  \n"  /* v1(h) GB             */ \
-    "shl        v2.16b, v1.16b, #4             \n"  /* B,R BBBB0000         */ \
-    "ushr       v3.16b, v1.16b, #4             \n"  /* G,A 0000GGGG         */ \
-    "ushr       v0.16b, v2.16b, #4             \n"  /* B,R 0000BBBB         */ \
-    "shl        v1.16b, v3.16b, #4             \n"  /* G,A GGGG0000         */ \
-    "orr        v2.16b, v0.16b, v2.16b         \n"  /* B,R BBBBBBBB         */ \
-    "orr        v3.16b, v1.16b, v3.16b         \n"  /* G,A GGGGGGGG         */ \
-    "dup        v0.2D, v2.D[1]                 \n"                             \
-    "dup        v1.2D, v3.D[1]                 \n"
+#define ARGB4444TOARGB                                                      \
+  "shrn       v1.8b,  v0.8h, #8              \n" /* v1(l) AR             */ \
+  "xtn2       v1.16b, v0.8h                  \n" /* v1(h) GB             */ \
+  "shl        v2.16b, v1.16b, #4             \n" /* B,R BBBB0000         */ \
+  "ushr       v3.16b, v1.16b, #4             \n" /* G,A 0000GGGG         */ \
+  "ushr       v0.16b, v2.16b, #4             \n" /* B,R 0000BBBB         */ \
+  "shl        v1.16b, v3.16b, #4             \n" /* G,A GGGG0000         */ \
+  "orr        v2.16b, v0.16b, v2.16b         \n" /* B,R BBBBBBBB         */ \
+  "orr        v3.16b, v1.16b, v3.16b         \n" /* G,A GGGGGGGG         */ \
+  "dup        v0.2D, v2.D[1]                 \n"                            \
+  "dup        v1.2D, v3.D[1]                 \n"
 
-void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
+                            uint8_t* dst_argb,
                             int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_argb),    // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+                                                            // pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_argb),      // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
+      );
 }
 
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    MEMACCESS(1)
-    "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_rgb24),  // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
-  );
+void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_rgb24,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of
+                                                      // RGB24.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_rgb24),  // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
+      );
 }
 
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
-    "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
-    MEMACCESS(1)
-    "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_raw),   // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
-  );
+void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
+      "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
+      "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_raw),   // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
+      );
 }
 
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1"  // Clobber List
-  );
+void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
+      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
+      "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1"  // Clobber List
+      );
 }
 
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1"  // Clobber List
-  );
+void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
+      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
+      "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1"  // Clobber List
+      );
 }
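
YUY2 packs a pixel pair as Y0,U,Y1,V while UYVY packs it as U,Y0,V,Y1, so after the de-interleaving ld2 the luma bytes land in v0 for YUY2 and in v1 for UYVY; that register choice is the only difference between the two loops above. Scalar sketch, hypothetical name:

    #include <stdint.h>

    /* Scalar sketch: luma extraction from YUY2 (Y at even offsets).
     * For UYVY read src_yuy2[2 * x + 1] instead. */
    static void YUY2ToYRow_sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                                  int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_y[x] = src_yuy2[2 * x];
      }
    }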
 
-void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
-    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
-    MEMACCESS(2)
-    "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2
+      "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
+      "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
+      "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
+      "b.gt       1b                             \n"
+      : "+r"(src_yuy2),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
-    "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
-    MEMACCESS(2)
-    "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY
+      "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
+      "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
+      "b.gt       1b                             \n"
+      : "+r"(src_uyvy),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
-    "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
-    "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
-    MEMACCESS(3)
-    "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_yuy2),     // %0
-    "+r"(src_yuy2b),    // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v5", "v6", "v7"  // Clobber List
-  );
+void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
+                      int stride_yuy2,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+      "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+      "urhadd     v1.8b, v1.8b, v5.8b            \n"        // average rows of U
+      "urhadd     v3.8b, v3.8b, v7.8b            \n"        // average rows of V
+      "st1        {v1.8b}, [%2], #8              \n"        // store 8 U.
+      "st1        {v3.8b}, [%3], #8              \n"        // store 8 V.
+      "b.gt       1b                             \n"
+      : "+r"(src_yuy2),   // %0
+        "+r"(src_yuy2b),  // %1
+        "+r"(dst_u),      // %2
+        "+r"(dst_v),      // %3
+        "+r"(width)       // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+        "v7"  // Clobber List
+      );
 }
 
-void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_uyvyb = src_uyvy + stride_uyvy;
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
-    "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
-    "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
-    MEMACCESS(3)
-    "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
-    "b.gt       1b                             \n"
-  : "+r"(src_uyvy),     // %0
-    "+r"(src_uyvyb),    // %1
-    "+r"(dst_u),        // %2
-    "+r"(dst_v),        // %3
-    "+r"(width)           // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v5", "v6", "v7"  // Clobber List
-  );
+void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
+                      int stride_uyvy,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_uyvyb = src_uyvy + stride_uyvy;
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
+      "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
+      "urhadd     v0.8b, v0.8b, v4.8b            \n"        // average rows of U
+      "urhadd     v2.8b, v2.8b, v6.8b            \n"        // average rows of V
+      "st1        {v0.8b}, [%2], #8              \n"        // store 8 U.
+      "st1        {v2.8b}, [%3], #8              \n"        // store 8 V.
+      "b.gt       1b                             \n"
+      : "+r"(src_uyvy),   // %0
+        "+r"(src_uyvyb),  // %1
+        "+r"(dst_u),      // %2
+        "+r"(dst_v),      // %3
+        "+r"(width)       // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+        "v7"  // Clobber List
+      );
 }
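
urhadd is an unsigned rounding halving add, (a + b + 1) >> 1; the UV row variants above use it to average the chroma of two adjacent source rows, which is the vertical subsampling step when producing a 4:2:0 layout. Scalar form, as a sketch:

    #include <stdint.h>

    static inline uint8_t RoundHalvingAdd(uint8_t a, uint8_t b) {
      return (uint8_t)(((unsigned)a + b + 1) >> 1);
    }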
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1        {v2.16b}, [%3]                 \n"  // shuffler
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
-    "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
-    "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store 4.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)        // %2
-  : "r"(shuffler)    // %3
-  : "cc", "memory", "v0", "v1", "v2"  // Clobber List
-  );
+void ARGBShuffleRow_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_argb,
+                         const uint8_t* shuffler,
+                         int width) {
+  asm volatile(
+      "ld1        {v2.16b}, [%3]                 \n"  // shuffler
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
+      "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
+      "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
+      "st1        {v1.16b}, [%1], #16            \n"  // store 4.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),                   // %0
+        "+r"(dst_argb),                   // %1
+        "+r"(width)                       // %2
+      : "r"(shuffler)                     // %3
+      : "cc", "memory", "v0", "v1", "v2"  // Clobber List
+      );
 }
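
tbl is a byte-wise table lookup: output byte i becomes input byte shuffler[i], so one 16-byte index vector permutes four ARGB pixels at a time and a single kernel covers all the channel orderings named in the comment. A scalar sketch, assuming the indices repeat per 4-byte pixel as the ARGB reorder masks do:

    #include <stdint.h>

    static void ARGBShuffleRow_sketch(const uint8_t* src_argb,
                                      uint8_t* dst_argb,
                                      const uint8_t* shuffler, int width) {
      int x, i;
      for (x = 0; x < width; ++x) {
        for (i = 0; i < 4; ++i) {
          dst_argb[4 * x + i] = src_argb[4 * x + shuffler[i]];
        }
      }
    }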
 
-void I422ToYUY2Row_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_yuy2, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
-    "orr        v2.8b, v1.8b, v1.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
-    MEMACCESS(2)
-    "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels
-    MEMACCESS(3)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_yuy2),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
+void I422ToYUY2Row_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_yuy2,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
+      "orr        v2.8b, v1.8b, v1.8b            \n"
+      "ld1        {v1.8b}, [%1], #8              \n"        // load 8 Us
+      "ld1        {v3.8b}, [%2], #8              \n"        // load 8 Vs
+      "subs       %w4, %w4, #16                  \n"        // 16 pixels
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+      "b.gt       1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_yuy2),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
-void I422ToUYVYRow_NEON(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_uyvy, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
-    "orr        v3.8b, v2.8b, v2.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
-    MEMACCESS(2)
-    "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
-    "subs       %w4, %w4, #16                  \n"  // 16 pixels
-    MEMACCESS(3)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_y),     // %0
-    "+r"(src_u),     // %1
-    "+r"(src_v),     // %2
-    "+r"(dst_uyvy),  // %3
-    "+r"(width)      // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
+void I422ToUYVYRow_NEON(const uint8_t* src_y,
+                        const uint8_t* src_u,
+                        const uint8_t* src_v,
+                        uint8_t* dst_uyvy,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
+      "orr        v3.8b, v2.8b, v2.8b            \n"
+      "ld1        {v0.8b}, [%1], #8              \n"        // load 8 Us
+      "ld1        {v2.8b}, [%2], #8              \n"        // load 8 Vs
+      "subs       %w4, %w4, #16                  \n"        // 16 pixels
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
+      "b.gt       1b                             \n"
+      : "+r"(src_y),     // %0
+        "+r"(src_u),     // %1
+        "+r"(src_v),     // %2
+        "+r"(dst_uyvy),  // %3
+        "+r"(width)      // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTORGB565
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_rgb565),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
-  );
+void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
+                          uint8_t* dst_rgb565,
+                          int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGBTORGB565
+      "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),    // %0
+        "+r"(dst_rgb565),  // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
 }
 
-void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
-  asm volatile (
-    "dup        v1.4s, %w2                     \n"  // dither4
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v20.8b, v20.8b, v1.8b          \n"
-    "uqadd      v21.8b, v21.8b, v1.8b          \n"
-    "uqadd      v22.8b, v22.8b, v1.8b          \n"
-    ARGBTORGB565
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
-    "b.gt       1b                             \n"
-  : "+r"(dst_rgb)    // %0
-  : "r"(src_argb),   // %1
-    "r"(dither4),    // %2
-    "r"(width)       // %3
-  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
-  );
+void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
+                                uint8_t* dst_rgb,
+                                const uint32_t dither4,
+                                int width) {
+  asm volatile(
+      "dup        v1.4s, %w2                     \n"  // dither4
+      "1:                                        \n"
+      "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uqadd      v20.8b, v20.8b, v1.8b          \n"
+      "uqadd      v21.8b, v21.8b, v1.8b          \n"
+      "uqadd      v22.8b, v22.8b, v1.8b          \n" ARGBTORGB565
+      "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
+      "b.gt       1b                             \n"
+      : "+r"(dst_rgb)   // %0
+      : "r"(src_argb),  // %1
+        "r"(dither4),   // %2
+        "r"(width)      // %3
+      : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23");
 }
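
The dither variant saturate-adds a dither byte per channel (the four bytes of dither4, repeated across lanes by dup) before ARGBTORGB565 truncates 8 bits down to 5/6/5; uqadd clamps at 255 so near-white pixels do not wrap. The add in scalar form, as a sketch:

    #include <stdint.h>

    static inline uint8_t SatAddU8(uint8_t c, uint8_t d) {
      unsigned s = (unsigned)c + d;
      return (uint8_t)(s > 255u ? 255u : s);
    }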
 
-void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb1555,
                             int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTOARGB1555
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb1555),  // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
-  );
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGBTOARGB1555
+      "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels
+                                                      // ARGB1555.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb1555),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v20", "v21", "v22", "v23");
 }
 
-void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
+                            uint8_t* dst_argb4444,
                             int width) {
-  asm volatile (
-    "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGBTOARGB4444
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),      // %0
-    "+r"(dst_argb4444),  // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
-  );
+  asm volatile(
+      "movi       v4.16b, #0x0f                  \n"  // bits to clear with
+                                                      // vbic.
+      "1:                                        \n"
+      "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGBTOARGB4444
+      "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels
+                                                      // ARGB4444.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),      // %0
+        "+r"(dst_argb4444),  // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23");
 }
 
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+      "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+      "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16 pixels
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_a),      // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16
+                                                                // pixels
+      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+      "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_a),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
-    "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
-    "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
+void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
+      "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
+      "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+      "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+      "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }
 
 // 8x1 pixels.
-void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+void ARGBToUV444Row_NEON(const uint8_t* src_argb,
+                         uint8_t* dst_u,
+                         uint8_t* dst_v,
                          int width) {
-  asm volatile (
-    "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
-    "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
-    "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
-    "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
-    "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
-    "movi       v29.16b,#0x80                  \n"  // 128.5
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
-    "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
-    "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
-    "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
+  asm volatile(
+      "movi       v24.8b, #112                   \n"  // UB / VR 0.875
+                                                      // coefficient
+      "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
+      "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
+      "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
+      "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
+      "movi       v29.16b,#0x80                  \n"  // 128.5
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+                                                            // pixels.
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+      "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
+      "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
+      "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
 
-    "umull      v3.8h, v2.8b, v24.8b           \n"  // R
-    "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
-    "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
-    "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
+      "umull      v3.8h, v2.8b, v24.8b           \n"  // R
+      "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
+      "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
+      "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
 
-    "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+      "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
+      "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
 
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
-    "v24", "v25", "v26", "v27", "v28", "v29"
-  );
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
+      "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_u),     // %1
+        "+r"(dst_v),     // %2
+        "+r"(width)      // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
+        "v27", "v28", "v29");
 }
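
// Scalar model of the U lane above (V swaps the 112 weight onto R); a sketch,
// not libyuv's C fallback. movi v29.16b, #0x80 makes each halfword 0x8080, so
// one add supplies both the +128 bias and +0.5 of rounding before the >>8.
#include <stdint.h>
static uint8_t RGBToU444_sketch(uint8_t r, uint8_t g, uint8_t b) {
  int u = (112 * b - 74 * g - 38 * r + 0x8080) >> 8;  // lands in 16..240
  if (u < 0) u = 0;    // mirror the uqshrn saturation, defensively
  if (u > 255) u = 255;
  return (uint8_t)u;
}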
 
-#define RGBTOUV_SETUP_REG                                                      \
-    "movi       v20.8h, #56, lsl #0  \n"  /* UB/VR coefficient (0.875) / 2 */  \
-    "movi       v21.8h, #37, lsl #0  \n"  /* UG coefficient (-0.5781) / 2  */  \
-    "movi       v22.8h, #19, lsl #0  \n"  /* UR coefficient (-0.2969) / 2  */  \
-    "movi       v23.8h, #9,  lsl #0  \n"  /* VB coefficient (-0.1406) / 2  */  \
-    "movi       v24.8h, #47, lsl #0  \n"  /* VG coefficient (-0.7344) / 2  */  \
-    "movi       v25.16b, #0x80       \n"  /* 128.5 (0x8080 in 16-bit)      */
+#define RGBTOUV_SETUP_REG                                                  \
+  "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
+  "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
+  "movi       v22.8h, #19, lsl #0  \n" /* UR coefficient (-0.2969) / 2  */ \
+  "movi       v23.8h, #9,  lsl #0  \n" /* VB coefficient (-0.1406) / 2  */ \
+  "movi       v24.8h, #47, lsl #0  \n" /* VG coefficient (-0.7344) / 2  */ \
+  "movi       v25.16b, #0x80       \n" /* 128.5 (0x8080 in 16-bit)      */
 
-// 32x1 pixels -> 8x1.  width is number of argb pixels. e.g. 32.
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
-                         int width) {
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(0)
-    "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n"  // load next 16.
-    "uaddlp     v4.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v5.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v6.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
-
-    "addp       v0.8h, v0.8h, v4.8h            \n"  // B 16 shorts -> 8 shorts.
-    "addp       v1.8h, v1.8h, v5.8h            \n"  // G 16 shorts -> 8 shorts.
-    "addp       v2.8h, v2.8h, v6.8h            \n"  // R 16 shorts -> 8 shorts.
-
-    "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
-    "urshr      v1.8h, v1.8h, #1               \n"
-    "urshr      v2.8h, v2.8h, #1               \n"
-
-    "subs       %w3, %w3, #32                  \n"  // 32 processed per loop.
-    "mul        v3.8h, v0.8h, v20.8h           \n"  // B
-    "mls        v3.8h, v1.8h, v21.8h           \n"  // G
-    "mls        v3.8h, v2.8h, v22.8h           \n"  // R
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v4.8h, v2.8h, v20.8h           \n"  // R
-    "mls        v4.8h, v1.8h, v24.8h           \n"  // G
-    "mls        v4.8h, v0.8h, v23.8h           \n"  // B
-    "add        v4.8h, v4.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v3.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v4.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
-    MEMACCESS(2)
-    "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_u),     // %1
-    "+r"(dst_v),     // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v23", "v24", "v25"
-  );
-}
-
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-#define RGBTOUV(QB, QG, QR) \
-    "mul        v3.8h, " #QB ",v20.8h          \n"  /* B                    */ \
-    "mul        v4.8h, " #QR ",v20.8h          \n"  /* R                    */ \
-    "mls        v3.8h, " #QG ",v21.8h          \n"  /* G                    */ \
-    "mls        v4.8h, " #QG ",v24.8h          \n"  /* G                    */ \
-    "mls        v3.8h, " #QR ",v22.8h          \n"  /* R                    */ \
-    "mls        v4.8h, " #QB ",v23.8h          \n"  /* B                    */ \
-    "add        v3.8h, v3.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
-    "add        v4.8h, v4.8h, v25.8h           \n"  /* +128 -> unsigned     */ \
-    "uqshrn     v0.8b, v3.8h, #8               \n"  /* 16 bit to 8 bit U    */ \
-    "uqshrn     v1.8b, v4.8h, #8               \n"  /* 16 bit to 8 bit V    */
+// clang-format off
+#define RGBTOUV(QB, QG, QR)                                                 \
+  "mul        v3.8h, " #QB ",v20.8h          \n" /* B                    */ \
+  "mul        v4.8h, " #QR ",v20.8h          \n" /* R                    */ \
+  "mls        v3.8h, " #QG ",v21.8h          \n" /* G                    */ \
+  "mls        v4.8h, " #QG ",v24.8h          \n" /* G                    */ \
+  "mls        v3.8h, " #QR ",v22.8h          \n" /* R                    */ \
+  "mls        v4.8h, " #QB ",v23.8h          \n" /* B                    */ \
+  "add        v3.8h, v3.8h, v25.8h           \n" /* +128 -> unsigned     */ \
+  "add        v4.8h, v4.8h, v25.8h           \n" /* +128 -> unsigned     */ \
+  "uqshrn     v0.8b, v3.8h, #8               \n" /* 16 bit to 8 bit U    */ \
+  "uqshrn     v1.8b, v4.8h, #8               \n" /* 16 bit to 8 bit V    */
+// clang-format on
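
// Why RGBTOUV_SETUP_REG halves every weight (56 = 112/2, 37 = 74/2, ...): the
// UV kernels feed this macro components that are 2x a 2x2 average. uaddlp
// sums horizontal pairs, uadalp adds the second row, and urshr #1 halves the
// sum of four ("2x average"). A hedged scalar model of one U output:
#include <stdint.h>
static uint8_t BoxU_sketch(const uint8_t b[4], const uint8_t g[4],
                           const uint8_t r[4]) {
  int b2 = (b[0] + b[1] + b[2] + b[3] + 1) >> 1;  // urshr #1: 2x the average
  int g2 = (g[0] + g[1] + g[2] + g[3] + 1) >> 1;
  int r2 = (r[0] + r[1] + r[2] + r[3] + 1) >> 1;
  int u = (56 * b2 - 37 * g2 - 19 * r2 + 0x8080) >> 8;  // RGBTOUV's U lane
  return (uint8_t)(u < 0 ? 0 : u > 255 ? 255 : u);
}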
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
 // TODO(fbarchard): consider ptrdiff_t for all strides.
 
-void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb_1 = src_argb + src_stride_argb;
+void ARGBToUVRow_NEON(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
 
-    MEMACCESS(1)
     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1486,9 +1415,7 @@
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1503,9 +1430,12 @@
 }
 
 // TODO(fbarchard): Subsample match C code.
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb_1 = src_argb + src_stride_argb;
+void ARGBToUVJRow_NEON(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* src_argb_1 = src_argb + src_stride_argb;
   asm volatile (
     "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
     "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
@@ -1514,12 +1444,10 @@
     "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
     "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
   "1:                                          \n"
-    MEMACCESS(0)
     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1531,9 +1459,7 @@
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_argb),  // %0
@@ -1547,18 +1473,19 @@
   );
 }
 
-void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
+void BGRAToUVRow_NEON(const uint8_t* src_bgra,
+                      int src_stride_bgra,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
     "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
     "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1570,9 +1497,7 @@
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_bgra),  // %0
@@ -1586,18 +1511,19 @@
   );
 }
 
-void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
+void ABGRToUVRow_NEON(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
     "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
     "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1609,9 +1535,7 @@
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v2.8h, v1.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_abgr),  // %0
@@ -1625,18 +1549,19 @@
   );
 }
 
-void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
-                      uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
+void RGBAToUVRow_NEON(const uint8_t* src_rgba,
+                      int src_stride_rgba,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
     "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
     "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1648,9 +1573,7 @@
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_rgba),  // %0
@@ -1664,18 +1587,19 @@
   );
 }
 
-void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
-                       uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
+void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
+                       int src_stride_rgb24,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1687,9 +1611,7 @@
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v0.8h, v1.8h, v2.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_rgb24),  // %0
@@ -1703,18 +1625,19 @@
   );
 }
 
-void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
-                     uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_raw_1 = src_raw + src_stride_raw;
+void RAWToUVRow_NEON(const uint8_t* src_raw,
+                     int src_stride_raw,
+                     uint8_t* dst_u,
+                     uint8_t* dst_v,
+                     int width) {
+  const uint8_t* src_raw_1 = src_raw + src_stride_raw;
   asm volatile (
     RGBTOUV_SETUP_REG
   "1:                                          \n"
-    MEMACCESS(0)
     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 8 RAW pixels.
     "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
     "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
-    MEMACCESS(1)
     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 8 more RAW pixels
     "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
@@ -1726,9 +1649,7 @@
 
     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
     RGBTOUV(v2.8h, v1.8h, v0.8h)
-    MEMACCESS(2)
     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
     "b.gt       1b                             \n"
   : "+r"(src_raw),  // %0
@@ -1743,699 +1664,656 @@
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
-  asm volatile (
-    "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
-    "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
-    "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
-    "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
-    "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
-    "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
+                        int src_stride_rgb565,
+                        uint8_t* dst_u,
+                        uint8_t* dst_v,
+                        int width) {
+  const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
+  asm volatile(
+      "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) /
+                                                      // 2
+      "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
+      "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
+      "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
+      "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
+      "movi       v27.16b, #0x80                 \n"  // 128.5 0x8080 in 16bit
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+      RGB565TOARGB
+      "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
-    RGB565TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
-    RGB565TOARGB
-    "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
+      RGB565TOARGB
+      "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
+      RGB565TOARGB
+      "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    "ins        v16.D[1], v17.D[0]             \n"
-    "ins        v18.D[1], v19.D[0]             \n"
-    "ins        v20.D[1], v21.D[0]             \n"
+      "ins        v16.D[1], v17.D[0]             \n"
+      "ins        v18.D[1], v19.D[0]             \n"
+      "ins        v20.D[1], v21.D[0]             \n"
 
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v18.8h, #1              \n"
-    "urshr      v6.8h, v20.8h, #1              \n"
+      "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+      "urshr      v5.8h, v18.8h, #1              \n"
+      "urshr      v6.8h, v20.8h, #1              \n"
 
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v16.8h, v4.8h, v22.8h          \n"  // B
-    "mls        v16.8h, v5.8h, v23.8h          \n"  // G
-    "mls        v16.8h, v6.8h, v24.8h          \n"  // R
-    "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
-    "mul        v17.8h, v6.8h, v22.8h          \n"  // R
-    "mls        v17.8h, v5.8h, v26.8h          \n"  // G
-    "mls        v17.8h, v4.8h, v25.8h          \n"  // B
-    "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(src_rgb565_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
-    "v25", "v26", "v27"
-  );
+      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+      "mul        v16.8h, v4.8h, v22.8h          \n"  // B
+      "mls        v16.8h, v5.8h, v23.8h          \n"  // G
+      "mls        v16.8h, v6.8h, v24.8h          \n"  // R
+      "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
+      "mul        v17.8h, v6.8h, v22.8h          \n"  // R
+      "mls        v17.8h, v5.8h, v26.8h          \n"  // G
+      "mls        v17.8h, v4.8h, v25.8h          \n"  // B
+      "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
+      "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
+      "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
+      "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+      "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb565),    // %0
+        "+r"(src_rgb565_1),  // %1
+        "+r"(dst_u),         // %2
+        "+r"(dst_v),         // %3
+        "+r"(width)          // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+        "v27");
 }
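
// RGB565TOARGB, roughly, for one pixel: a hedged C sketch of the usual 5:6:5
// widening, where each field is shifted up and its top bits are replicated
// into the low bits so that 0x1f maps to 0xff and 0 stays 0.
#include <stdint.h>
static void Unpack565_sketch(uint16_t pix, uint8_t* b, uint8_t* g, uint8_t* r) {
  uint8_t b5 = (uint8_t)(pix & 0x1f);
  uint8_t g6 = (uint8_t)((pix >> 5) & 0x3f);
  uint8_t r5 = (uint8_t)(pix >> 11);
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}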
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
-                        uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
+                          int src_stride_argb1555,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
+  asm volatile(
+      RGBTOUV_SETUP_REG
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
-    RGB555TOARGB
-    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
+      RGB555TOARGB
+      "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    "ins        v16.D[1], v26.D[0]             \n"
-    "ins        v17.D[1], v27.D[0]             \n"
-    "ins        v18.D[1], v28.D[0]             \n"
+      "ins        v16.D[1], v26.D[0]             \n"
+      "ins        v17.D[1], v27.D[0]             \n"
+      "ins        v18.D[1], v28.D[0]             \n"
 
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v17.8h, #1              \n"
-    "urshr      v6.8h, v18.8h, #1              \n"
+      "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+      "urshr      v5.8h, v17.8h, #1              \n"
+      "urshr      v6.8h, v18.8h, #1              \n"
 
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
-    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
-    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
-    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
-    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
-    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(src_argb1555_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
-    "v26", "v27", "v28"
-  );
+      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+      "mul        v2.8h, v4.8h, v20.8h           \n"  // B
+      "mls        v2.8h, v5.8h, v21.8h           \n"  // G
+      "mls        v2.8h, v6.8h, v22.8h           \n"  // R
+      "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+      "mul        v3.8h, v6.8h, v20.8h           \n"  // R
+      "mls        v3.8h, v5.8h, v24.8h           \n"  // G
+      "mls        v3.8h, v4.8h, v23.8h           \n"  // B
+      "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+      "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+      "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+      "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+      "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb1555),    // %0
+        "+r"(src_argb1555_1),  // %1
+        "+r"(dst_u),           // %2
+        "+r"(dst_v),           // %3
+        "+r"(width)            // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+        "v28");
 }
 
 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
-void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
-                          uint8* dst_u, uint8* dst_v, int width) {
-  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
-  asm volatile (
-    RGBTOUV_SETUP_REG
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
+                          int src_stride_argb4444,
+                          uint8_t* dst_u,
+                          uint8_t* dst_v,
+                          int width) {
+  const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
+  asm volatile(
+      RGBTOUV_SETUP_REG
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
-    ARGB4444TOARGB
-    "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
-    "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
-    "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
+      "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
+      ARGB4444TOARGB
+      "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
+      "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
+      "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
 
-    "ins        v16.D[1], v26.D[0]             \n"
-    "ins        v17.D[1], v27.D[0]             \n"
-    "ins        v18.D[1], v28.D[0]             \n"
+      "ins        v16.D[1], v26.D[0]             \n"
+      "ins        v17.D[1], v27.D[0]             \n"
+      "ins        v18.D[1], v28.D[0]             \n"
 
-    "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
-    "urshr      v5.8h, v17.8h, #1              \n"
-    "urshr      v6.8h, v18.8h, #1              \n"
+      "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
+      "urshr      v5.8h, v17.8h, #1              \n"
+      "urshr      v6.8h, v18.8h, #1              \n"
 
-    "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
-    "mul        v2.8h, v4.8h, v20.8h           \n"  // B
-    "mls        v2.8h, v5.8h, v21.8h           \n"  // G
-    "mls        v2.8h, v6.8h, v22.8h           \n"  // R
-    "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
-    "mul        v3.8h, v6.8h, v20.8h           \n"  // R
-    "mls        v3.8h, v5.8h, v24.8h           \n"  // G
-    "mls        v3.8h, v4.8h, v23.8h           \n"  // B
-    "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
-    "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
-    "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
-    MEMACCESS(3)
-    "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(src_argb4444_1),  // %1
-    "+r"(dst_u),     // %2
-    "+r"(dst_v),     // %3
-    "+r"(width)        // %4
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
-    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
-    "v26", "v27", "v28"
+      "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
+      "mul        v2.8h, v4.8h, v20.8h           \n"  // B
+      "mls        v2.8h, v5.8h, v21.8h           \n"  // G
+      "mls        v2.8h, v6.8h, v22.8h           \n"  // R
+      "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
+      "mul        v3.8h, v6.8h, v20.8h           \n"  // R
+      "mls        v3.8h, v5.8h, v24.8h           \n"  // G
+      "mls        v3.8h, v4.8h, v23.8h           \n"  // B
+      "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
+      "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
+      "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
+      "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
+      "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb4444),    // %0
+        "+r"(src_argb4444_1),  // %1
+        "+r"(dst_u),           // %2
+        "+r"(dst_v),           // %3
+        "+r"(width)            // %4
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17",
+        "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
+        "v28"
 
-  );
+      );
 }
 
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
-    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
-    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
-    "movi       v27.8b, #16                    \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    RGB565TOARGB
-    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v27.8b           \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb565),  // %0
-    "+r"(dst_y),       // %1
-    "+r"(width)          // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
-    "v24", "v25", "v26", "v27"
-  );
+void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+      "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+      "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+      "movi       v27.8b, #16                    \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      RGB565TOARGB
+      "umull      v3.8h, v0.8b, v24.8b           \n"  // B
+      "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
+      "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
+      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v27.8b           \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb565),  // %0
+        "+r"(dst_y),       // %1
+        "+r"(width)        // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26",
+        "v27");
 }
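
// The limited-range Y pattern shared by the ...ToYRow kernels here, as a
// hedged scalar sketch: 13/65/33 over 128 approximate the BT.601 weights,
// sqrshrun #7 is a rounding >>7, and uqadd adds the +16 offset saturating.
#include <stdint.h>
static uint8_t RGBToY_sketch(uint8_t r, uint8_t g, uint8_t b) {
  int y = ((13 * b + 65 * g + 33 * r + 64) >> 7) + 16;  // 16..237 for 8-bit in
  return (uint8_t)(y > 255 ? 255 : y);  // uqadd would clamp; defensive here
}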
 
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB1555TOARGB
-    "umull      v3.8h, v0.8b, v4.8b            \n"  // B
-    "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
-    "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb1555),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGB1555TOARGB
+      "umull      v3.8h, v0.8b, v4.8b            \n"  // B
+      "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
+      "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
+      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb1555),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
-    "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
-    "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
-    "movi       v27.8b, #16                    \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    ARGB4444TOARGB
-    "umull      v3.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v27.8b           \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb4444),  // %0
-    "+r"(dst_y),         // %1
-    "+r"(width)            // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
-  );
+void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
+                         uint8_t* dst_y,
+                         int width) {
+  asm volatile(
+      "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
+      "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
+      "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
+      "movi       v27.8b, #16                    \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      ARGB4444TOARGB
+      "umull      v3.8h, v0.8b, v24.8b           \n"  // B
+      "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
+      "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
+      "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v27.8b           \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb4444),  // %0
+        "+r"(dst_y),         // %1
+        "+r"(width)          // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
 }
 
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v1.8b, v4.8b           \n"  // R
-    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_bgra),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v1.8b, v4.8b           \n"  // R
+      "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
+      "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
+      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_bgra),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // R
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_abgr),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v0.8b, v4.8b           \n"  // R
+      "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+      "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
+      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_abgr),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v1.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgba),  // %0
-    "+r"(dst_y),     // %1
-    "+r"(width)        // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v1.8b, v4.8b           \n"  // B
+      "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
+      "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
+      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_rgba),  // %0
+        "+r"(dst_y),     // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_rgb24),  // %0
-    "+r"(dst_y),      // %1
-    "+r"(width)         // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v0.8b, v4.8b           \n"  // B
+      "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+      "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
+      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_rgb24),  // %0
+        "+r"(dst_y),      // %1
+        "+r"(width)       // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
-  asm volatile (
-    "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
-    "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
-    "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
-    "movi       v7.8b, #16                     \n"  // Add 16 constant
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v0.8b, v4.8b           \n"  // B
-    "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
-    "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
-    "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
-    "uqadd      v0.8b, v0.8b, v7.8b            \n"
-    MEMACCESS(1)
-    "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
-    "b.gt       1b                             \n"
-  : "+r"(src_raw),  // %0
-    "+r"(dst_y),    // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
+  asm volatile(
+      "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
+      "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
+      "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
+      "movi       v7.8b, #16                     \n"  // Add 16 constant
+      "1:                                        \n"
+      "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v0.8b, v4.8b           \n"  // B
+      "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
+      "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
+      "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
+      "uqadd      v0.8b, v0.8b, v7.8b            \n"
+      "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
+      "b.gt       1b                             \n"
+      : "+r"(src_raw),  // %0
+        "+r"(dst_y),    // %1
+        "+r"(width)     // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
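
// For reference, a scalar sketch of the Y computation the four *ToYRow
// functions above share (hypothetical helper, not in libyuv; types from
// <stdint.h>). It mirrors the NEON 7-bit fixed point: sqrshrun #7 is a
// rounded shift and uqadd saturates the +16 bias.
static uint8_t RGBToY_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  int y = (33 * r + 65 * g + 13 * b + 64) >> 7;  // rounded; at most 221
  y += 16;                                       // add 16 constant
  return (uint8_t)(y > 255 ? 255 : y);           // saturate like uqadd
}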
 
 // Bilinear filter 16x2 -> 16x1
-void InterpolateRow_NEON(uint8* dst_ptr,
-                         const uint8* src_ptr, ptrdiff_t src_stride,
-                         int dst_width, int source_y_fraction) {
+void InterpolateRow_NEON(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         int dst_width,
+                         int source_y_fraction) {
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  asm volatile (
-    "cmp        %w4, #0                        \n"
-    "b.eq       100f                           \n"
-    "cmp        %w4, #128                      \n"
-    "b.eq       50f                            \n"
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  asm volatile(
+      "cmp        %w4, #0                        \n"
+      "b.eq       100f                           \n"
+      "cmp        %w4, #128                      \n"
+      "b.eq       50f                            \n"
 
-    "dup        v5.16b, %w4                    \n"
-    "dup        v4.16b, %w5                    \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    MEMACCESS(2)
-    "ld1        {v1.16b}, [%2], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    "umull      v2.8h, v0.8b,  v4.8b           \n"
-    "umull2     v3.8h, v0.16b, v4.16b          \n"
-    "umlal      v2.8h, v1.8b,  v5.8b           \n"
-    "umlal2     v3.8h, v1.16b, v5.16b          \n"
-    "rshrn      v0.8b,  v2.8h, #8              \n"
-    "rshrn2     v0.16b, v3.8h, #8              \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       1b                             \n"
-    "b          99f                            \n"
+      "dup        v5.16b, %w4                    \n"
+      "dup        v4.16b, %w5                    \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%1], #16            \n"
+      "ld1        {v1.16b}, [%2], #16            \n"
+      "subs       %w3, %w3, #16                  \n"
+      "umull      v2.8h, v0.8b,  v4.8b           \n"
+      "umull2     v3.8h, v0.16b, v4.16b          \n"
+      "umlal      v2.8h, v1.8b,  v5.8b           \n"
+      "umlal2     v3.8h, v1.16b, v5.16b          \n"
+      "rshrn      v0.8b,  v2.8h, #8              \n"
+      "rshrn2     v0.16b, v3.8h, #8              \n"
+      "st1        {v0.16b}, [%0], #16            \n"
+      "b.gt       1b                             \n"
+      "b          99f                            \n"
 
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    MEMACCESS(2)
-    "ld1        {v1.16b}, [%2], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    "urhadd     v0.16b, v0.16b, v1.16b         \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       50b                            \n"
-    "b          99f                            \n"
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "ld1        {v0.16b}, [%1], #16            \n"
+      "ld1        {v1.16b}, [%2], #16            \n"
+      "subs       %w3, %w3, #16                  \n"
+      "urhadd     v0.16b, v0.16b, v1.16b         \n"
+      "st1        {v0.16b}, [%0], #16            \n"
+      "b.gt       50b                            \n"
+      "b          99f                            \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "ld1        {v0.16b}, [%1], #16            \n"
-    "subs       %w3, %w3, #16                  \n"
-    MEMACCESS(0)
-    "st1        {v0.16b}, [%0], #16            \n"
-    "b.gt       100b                           \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "ld1        {v0.16b}, [%1], #16            \n"
+      "subs       %w3, %w3, #16                  \n"
+      "st1        {v0.16b}, [%0], #16            \n"
+      "b.gt       100b                           \n"
 
-  "99:                                         \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_ptr1),         // %2
-    "+r"(dst_width),        // %3
-    "+r"(y1_fraction),      // %4
-    "+r"(y0_fraction)       // %5
-  :
-  : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
-  );
+      "99:                                       \n"
+      : "+r"(dst_ptr),      // %0
+        "+r"(src_ptr),      // %1
+        "+r"(src_ptr1),     // %2
+        "+r"(dst_width),    // %3
+        "+r"(y1_fraction),  // %4
+        "+r"(y0_fraction)   // %5
+      :
+      : "cc", "memory", "v0", "v1", "v3", "v4", "v5");
 }
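
// A scalar sketch of the general blend path above (hypothetical helper,
// not in libyuv): dst = (src * (256 - f) + src1 * f + 128) >> 8 with
// f = source_y_fraction; the asm special-cases f == 0 (plain copy) and
// f == 128 (urhadd average).
static void InterpolateRow_Sketch(uint8_t* dst, const uint8_t* src,
                                  ptrdiff_t stride, int width, int f) {
  const uint8_t* src1 = src + stride;
  for (int x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((src[x] * (256 - f) + src1[x] * f + 128) >> 8);
  }
}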
 
 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
-void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                       uint8* dst_argb, int width) {
-  asm volatile (
-    "subs       %w3, %w3, #8                   \n"
-    "b.lt       89f                            \n"
-    // Blend 8 pixels.
-  "8:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
-    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
-    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
-    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
-    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
-    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
-    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
-    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
-    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
-    "movi       v3.8b, #255                    \n"  // a = 255
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.ge       8b                             \n"
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width) {
+  asm volatile(
+      "subs       %w3, %w3, #8                   \n"
+      "b.lt       89f                            \n"
+      // Blend 8 pixels.
+      "8:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0
+                                                            // pixels
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1
+                                                            // pixels
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+      "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+      "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+      "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+      "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+      "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+      "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+      "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+      "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+      "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+      "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+      "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+      "movi       v3.8b, #255                    \n"  // a = 255
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+                                                            // pixels
+      "b.ge       8b                             \n"
 
-  "89:                                         \n"
-    "adds       %w3, %w3, #8-1                 \n"
-    "b.lt       99f                            \n"
+      "89:                                       \n"
+      "adds       %w3, %w3, #8-1                 \n"
+      "b.lt       99f                            \n"
 
-    // Blend 1 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
-    MEMACCESS(1)
-    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
-    "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
-    "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
-    "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
-    "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
-    "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
-    "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
-    "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
-    "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
-    "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
-    "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
-    "movi       v3.8b, #255                    \n"  // a = 255
-    MEMACCESS(2)
-    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
-    "b.ge       1b                             \n"
+      // Blend 1 pixel at a time.
+      "1:                                        \n"
+      "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
+      "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
+      "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
+      "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
+      "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
+      "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
+      "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
+      "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
+      "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
+      "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
+      "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
+      "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
+      "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
+      "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
+      "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
+      "movi       v3.8b, #255                    \n"  // a = 255
+      "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
+      "b.ge       1b                             \n"
 
-  "99:                                         \n"
+      "99:                                       \n"
 
-  : "+r"(src_argb0),    // %0
-    "+r"(src_argb1),    // %1
-    "+r"(dst_argb),     // %2
-    "+r"(width)         // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v16", "v17", "v18"
-  );
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18");
 }
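
// One blended channel as a scalar sketch (hypothetical helper, not in
// libyuv): s is the ARGB0 channel, d the ARGB1 channel, a the ARGB0 alpha.
// uqrshrn #8 gives the rounded d * a / 256 term and uqadd saturates.
static uint8_t BlendChannel_Sketch(uint8_t s, uint8_t d, uint8_t a) {
  int v = d - ((d * a + 128) >> 8);     // dr - dr * sa / 256
  v += s;                               // + sr
  return (uint8_t)(v > 255 ? 255 : v);  // alpha is then forced to 255
}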
 
 // Attenuate 8 pixels at a time.
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    // Attenuate 8 pixels.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
-    "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
-    "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
-    "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
-    "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
-    "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
+void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int width) {
+  asm volatile(
+      // Attenuate 8 pixels.
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
+      "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
+      "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
+      "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
+      "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
+      "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB
+                                                            // pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }
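
// The attenuation per channel, as a scalar sketch (hypothetical helper,
// not in libyuv): umull + uqrshrn #8 is a rounded multiply by alpha / 256.
static uint8_t Attenuate_Sketch(uint8_t c, uint8_t a) {
  return (uint8_t)((c * a + 128) >> 8);  // never exceeds 254, so no clamp
}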
 
 // Quantize 8 ARGB pixels (32 bytes).
 // dst = (dst * scale >> 16) * interval_size + interval_offset;
-void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
-  asm volatile (
-    "dup        v4.8h, %w2                     \n"
-    "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
-    "dup        v5.8h, %w3                     \n"  // interval multiply.
-    "dup        v6.8h, %w4                     \n"  // interval add
+void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
+                          int scale,
+                          int interval_size,
+                          int interval_offset,
+                          int width) {
+  asm volatile(
+      "dup        v4.8h, %w2                     \n"
+      "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
+      "dup        v5.8h, %w3                     \n"  // interval multiply.
+      "dup        v6.8h, %w4                     \n"  // interval add
 
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
-    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
-    "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
-    "uxtl       v1.8h, v1.8b                   \n"
-    "uxtl       v2.8h, v2.8b                   \n"
-    "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
-    "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
-    "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
-    "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
-    "mul        v1.8h, v1.8h, v5.8h            \n"  // g
-    "mul        v2.8h, v2.8h, v5.8h            \n"  // r
-    "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
-    "add        v1.8h, v1.8h, v6.8h            \n"  // g
-    "add        v2.8h, v2.8h, v6.8h            \n"  // r
-    "uqxtn      v0.8b, v0.8h                   \n"
-    "uqxtn      v1.8b, v1.8h                   \n"
-    "uqxtn      v2.8b, v2.8h                   \n"
-    MEMACCESS(0)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(dst_argb),       // %0
-    "+r"(width)           // %1
-  : "r"(scale),           // %2
-    "r"(interval_size),   // %3
-    "r"(interval_offset)  // %4
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
-  );
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8  ARGB.
+      "subs       %w1, %w1, #8                   \n"    // 8 processed per loop.
+      "uxtl       v0.8h, v0.8b                   \n"    // b (0 .. 255)
+      "uxtl       v1.8h, v1.8b                   \n"
+      "uxtl       v2.8h, v2.8b                   \n"
+      "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
+      "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
+      "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
+      "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
+      "mul        v1.8h, v1.8h, v5.8h            \n"  // g
+      "mul        v2.8h, v2.8h, v5.8h            \n"  // r
+      "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
+      "add        v1.8h, v1.8h, v6.8h            \n"  // g
+      "add        v2.8h, v2.8h, v6.8h            \n"  // r
+      "uqxtn      v0.8b, v0.8h                   \n"
+      "uqxtn      v1.8b, v1.8h                   \n"
+      "uqxtn      v2.8b, v2.8h                   \n"
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(dst_argb),       // %0
+        "+r"(width)           // %1
+      : "r"(scale),           // %2
+        "r"(interval_size),   // %3
+        "r"(interval_offset)  // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
 }
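
// A scalar sketch of the quantize step (hypothetical helper, not in
// libyuv). sqdmulh doubles its product, so pre-halving scale makes the
// pair behave like (v * scale) >> 16, as in the C formula above.
static uint8_t Quantize_Sketch(uint8_t v, int scale, int interval_size,
                               int interval_offset) {
  int q = ((v * scale) >> 16) * interval_size + interval_offset;
  return (uint8_t)(q > 255 ? 255 : q);  // uqxtn saturates the store
}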
 
 // Shade 8 pixels at a time by specified value.
 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
-void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  asm volatile (
-    "dup        v0.4s, %w3                     \n"  // duplicate scale value.
-    "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
-    "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
+void ARGBShadeRow_NEON(const uint8_t* src_argb,
+                       uint8_t* dst_argb,
+                       int width,
+                       uint32_t value) {
+  asm volatile(
+      "dup        v0.4s, %w3                     \n"  // duplicate scale value.
+      "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
+      "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
 
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
-    "uxtl       v5.8h, v5.8b                   \n"
-    "uxtl       v6.8h, v6.8b                   \n"
-    "uxtl       v7.8h, v7.8b                   \n"
-    "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
-    "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
-    "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
-    "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
-    "uqxtn      v4.8b, v4.8h                   \n"
-    "uqxtn      v5.8b, v5.8h                   \n"
-    "uqxtn      v6.8b, v6.8h                   \n"
-    "uqxtn      v7.8b, v7.8h                   \n"
-    MEMACCESS(1)
-    "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),       // %0
-    "+r"(dst_argb),       // %1
-    "+r"(width)           // %2
-  : "r"(value)            // %3
-  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
-  );
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
+      "uxtl       v5.8h, v5.8b                   \n"
+      "uxtl       v6.8h, v6.8b                   \n"
+      "uxtl       v7.8h, v7.8b                   \n"
+      "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
+      "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
+      "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
+      "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
+      "uqxtn      v4.8b, v4.8h                   \n"
+      "uqxtn      v5.8b, v5.8h                   \n"
+      "uqxtn      v6.8b, v6.8h                   \n"
+      "uqxtn      v7.8b, v7.8h                   \n"
+      "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      : "r"(value)       // %3
+      : "cc", "memory", "v0", "v4", "v5", "v6", "v7");
 }
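
// A scalar sketch of one shaded channel (hypothetical helper, not in
// libyuv). zip1 doubles each byte of value into c * 257, ushr halves it,
// and sqrdmulh computes (2 * a * b + 0x8000) >> 16, so a scale byte of 255
// is close to an identity.
static uint8_t Shade_Sketch(uint8_t c, uint8_t scale) {
  return (uint8_t)((2 * c * ((scale * 257) >> 1) + 0x8000) >> 16);
}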
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
 // Similar to ARGBToYJ but stores ARGB.
 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
-void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
-    "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
-    "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v24.8b           \n"  // B
-    "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
-    "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
-    "orr        v1.8b, v0.8b, v0.8b            \n"  // G
-    "orr        v2.8b, v0.8b, v0.8b            \n"  // R
-    MEMACCESS(1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
-  );
+void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
+      "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
+      "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "umull      v4.8h, v0.8b, v24.8b           \n"  // B
+      "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
+      "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
+      "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
+      "orr        v1.8b, v0.8b, v0.8b            \n"  // G
+      "orr        v2.8b, v0.8b, v0.8b            \n"  // R
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(width)      // %2
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26");
 }
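
// A scalar sketch of one gray pixel (hypothetical helper, not in libyuv),
// using the C formula quoted above; ARGB is stored B, G, R, A in memory.
static void GrayPixel_Sketch(const uint8_t src[4], uint8_t dst[4]) {
  uint8_t y = (uint8_t)((15 * src[0] + 75 * src[1] + 38 * src[2] + 64) >> 7);
  dst[0] = dst[1] = dst[2] = y;  // B = G = R = gray
  dst[3] = src[3];               // alpha passes through
}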
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
@@ -2443,194 +2321,180 @@
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
 
-void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v20.8b, #17                    \n"  // BB coefficient
-    "movi       v21.8b, #68                    \n"  // BG coefficient
-    "movi       v22.8b, #35                    \n"  // BR coefficient
-    "movi       v24.8b, #22                    \n"  // GB coefficient
-    "movi       v25.8b, #88                    \n"  // GG coefficient
-    "movi       v26.8b, #45                    \n"  // GR coefficient
-    "movi       v28.8b, #24                    \n"  // BB coefficient
-    "movi       v29.8b, #98                    \n"  // BG coefficient
-    "movi       v30.8b, #50                    \n"  // BR coefficient
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
-    "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
-    "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
-    "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
-    "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
-    "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
-    "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
-    "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
-    "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
-    "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
-    "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
-    "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
-    "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
-    "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
-    MEMACCESS(0)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(dst_argb),  // %0
-    "+r"(width)      // %1
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
-  );
+void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
+  asm volatile(
+      "movi       v20.8b, #17                    \n"  // BB coefficient
+      "movi       v21.8b, #68                    \n"  // BG coefficient
+      "movi       v22.8b, #35                    \n"  // BR coefficient
+      "movi       v24.8b, #22                    \n"  // GB coefficient
+      "movi       v25.8b, #88                    \n"  // GG coefficient
+      "movi       v26.8b, #45                    \n"  // GR coefficient
+      "movi       v28.8b, #24                    \n"  // BB coefficient
+      "movi       v29.8b, #98                    \n"  // BG coefficient
+      "movi       v30.8b, #50                    \n"  // BR coefficient
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
+      "subs       %w1, %w1, #8                   \n"   // 8 processed per loop.
+      "umull      v4.8h, v0.8b, v20.8b           \n"   // B to Sepia B
+      "umlal      v4.8h, v1.8b, v21.8b           \n"   // G
+      "umlal      v4.8h, v2.8b, v22.8b           \n"   // R
+      "umull      v5.8h, v0.8b, v24.8b           \n"   // B to Sepia G
+      "umlal      v5.8h, v1.8b, v25.8b           \n"   // G
+      "umlal      v5.8h, v2.8b, v26.8b           \n"   // R
+      "umull      v6.8h, v0.8b, v28.8b           \n"   // B to Sepia R
+      "umlal      v6.8h, v1.8b, v29.8b           \n"   // G
+      "umlal      v6.8h, v2.8b, v30.8b           \n"   // R
+      "uqshrn     v0.8b, v4.8h, #7               \n"   // 16 bit to 8 bit B
+      "uqshrn     v1.8b, v5.8h, #7               \n"   // 16 bit to 8 bit G
+      "uqshrn     v2.8b, v6.8h, #7               \n"   // 16 bit to 8 bit R
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
+      "b.gt       1b                             \n"
+      : "+r"(dst_argb),  // %0
+        "+r"(width)      // %1
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+        "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30");
 }
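
// One sepia pixel as a scalar sketch (hypothetical helper, not in libyuv),
// applying the matrix above in place; uqshrn #7 truncates and saturates.
static void SepiaPixel_Sketch(uint8_t p[4]) {  // p is {B, G, R, A}
  int b = (17 * p[0] + 68 * p[1] + 35 * p[2]) >> 7;
  int g = (22 * p[0] + 88 * p[1] + 45 * p[2]) >> 7;
  int r = (24 * p[0] + 98 * p[1] + 50 * p[2]) >> 7;
  p[0] = (uint8_t)(b > 255 ? 255 : b);
  p[1] = (uint8_t)(g > 255 ? 255 : g);
  p[2] = (uint8_t)(r > 255 ? 255 : r);  // alpha p[3] is untouched
}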
 
 // Transform 8 ARGB pixels (32 bytes) with color matrix.
 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
 // needs to saturate.  Consider doing a non-saturating version.
-void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
-                             const int8* matrix_argb, int width) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
-    "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
-    "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
+void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
+                             uint8_t* dst_argb,
+                             const int8_t* matrix_argb,
+                             int width) {
+  asm volatile(
+      "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
+      "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
+      "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
 
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
-    "uxtl       v17.8h, v17.8b                 \n"  // g
-    "uxtl       v18.8h, v18.8b                 \n"  // r
-    "uxtl       v19.8h, v19.8b                 \n"  // a
-    "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
-    "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
-    "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
-    "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
-    "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
-    "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
-    "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
-    "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
-    "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
-    "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
-    "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
-    "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
-    "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
-    "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
-    "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
-    "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
-    "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
-    "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
-    "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
-    "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
-    "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
-    "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
-    MEMACCESS(1)
-    "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(width)       // %2
-  : "r"(matrix_argb)  // %3
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v22", "v23", "v24", "v25"
-  );
+      "1:                                        \n"
+      "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 ARGB
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
+      "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
+      "uxtl       v17.8h, v17.8b                 \n"  // g
+      "uxtl       v18.8h, v18.8b                 \n"  // r
+      "uxtl       v19.8h, v19.8b                 \n"  // a
+      "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
+      "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
+      "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
+      "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
+      "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
+      "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
+      "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
+      "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
+      "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+      "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+      "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+      "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+      "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
+      "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
+      "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
+      "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
+      "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+      "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+      "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+      "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+      "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
+      "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
+      "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
+      "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
+      "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
+      "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
+      "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
+      "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
+      "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
+      "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
+      "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
+      "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
+      "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),   // %0
+        "+r"(dst_argb),   // %1
+        "+r"(width)       // %2
+      : "r"(matrix_argb)  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
+        "v17", "v18", "v19", "v22", "v23", "v24", "v25");
 }
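
// A scalar sketch of the color-matrix transform (hypothetical helper, not
// in libyuv): matrix_argb is 4 rows of 4 signed coefficients with 6
// fractional bits, and sqshrun #6 clamps each dot product to 0..255.
static void ColorMatrixPixel_Sketch(const uint8_t src[4], uint8_t dst[4],
                                    const int8_t m[16]) {
  for (int i = 0; i < 4; ++i) {
    int v = (src[0] * m[i * 4 + 0] + src[1] * m[i * 4 + 1] +
             src[2] * m[i * 4 + 2] + src[3] * m[i * 4 + 3]) >> 6;
    dst[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}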
 
 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
-    "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
-    "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
-    "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
-    "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
-    "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
-    "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
-    "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
+      "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
+      "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
+      "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
+      "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
+      "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
+      "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
+      "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
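
// The per-channel multiply as a scalar sketch (hypothetical helper, not in
// libyuv): umull + rshrn #8 is a rounded (a * b) / 256, which for 8-bit
// inputs never exceeds 254, so the non-saturating narrow is safe.
static uint8_t Multiply_Sketch(uint8_t a, uint8_t b) {
  return (uint8_t)((a * b + 128) >> 8);
}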
 
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v0.8b, v0.8b, v4.8b            \n"
-    "uqadd      v1.8b, v1.8b, v5.8b            \n"
-    "uqadd      v2.8b, v2.8b, v6.8b            \n"
-    "uqadd      v3.8b, v3.8b, v7.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGBAddRow_NEON(const uint8_t* src_argb0,
+                     const uint8_t* src_argb1,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uqadd      v0.8b, v0.8b, v4.8b            \n"
+      "uqadd      v1.8b, v1.8b, v5.8b            \n"
+      "uqadd      v2.8b, v2.8b, v6.8b            \n"
+      "uqadd      v3.8b, v3.8b, v7.8b            \n"
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
-void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
-  asm volatile (
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
-    MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqsub      v0.8b, v0.8b, v4.8b            \n"
-    "uqsub      v1.8b, v1.8b, v5.8b            \n"
-    "uqsub      v2.8b, v2.8b, v6.8b            \n"
-    "uqsub      v3.8b, v3.8b, v7.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-
-  : "+r"(src_argb0),  // %0
-    "+r"(src_argb1),  // %1
-    "+r"(dst_argb),   // %2
-    "+r"(width)       // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-  );
+void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
+                          const uint8_t* src_argb1,
+                          uint8_t* dst_argb,
+                          int width) {
+  asm volatile(
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB
+      "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uqsub      v0.8b, v0.8b, v4.8b            \n"
+      "uqsub      v1.8b, v1.8b, v5.8b            \n"
+      "uqsub      v2.8b, v2.8b, v6.8b            \n"
+      "uqsub      v3.8b, v3.8b, v7.8b            \n"
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_argb0),  // %0
+        "+r"(src_argb1),  // %1
+        "+r"(dst_argb),   // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 
 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
@@ -2638,54 +2502,50 @@
 // R = Sobel
 // G = Sobel
 // B = Sobel
-void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
-    "orr        v1.8b, v0.8b, v0.8b            \n"
-    "orr        v2.8b, v0.8b, v0.8b            \n"
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
+void SobelRow_NEON(const uint8_t* src_sobelx,
+                   const uint8_t* src_sobely,
+                   uint8_t* dst_argb,
+                   int width) {
+  asm volatile(
+      "movi       v3.8b, #255                    \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
+      "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
+      "orr        v1.8b, v0.8b, v0.8b            \n"
+      "orr        v2.8b, v0.8b, v0.8b            \n"
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
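
// One combined Sobel pixel as a scalar sketch (hypothetical helper, not in
// libyuv): a saturating add of the two gradient planes, splatted to B, G
// and R with opaque alpha.
static void SobelPixel_Sketch(uint8_t sx, uint8_t sy, uint8_t argb[4]) {
  int s = sx + sy;
  uint8_t v = (uint8_t)(s > 255 ? 255 : s);  // uqadd
  argb[0] = argb[1] = argb[2] = v;
  argb[3] = 255;
}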
 
 // Adds Sobel X and Sobel Y and stores Sobel into plane.
-void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
-  asm volatile (
-    // 16 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
-    MEMACCESS(1)
-    "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
-    "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
-    MEMACCESS(2)
-    "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_y),       // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1"
-  );
+void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
+                          const uint8_t* src_sobely,
+                          uint8_t* dst_y,
+                          int width) {
+  asm volatile(
+      // 16 pixel loop.
+      "1:                                        \n"
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
+      "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
+      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
+      "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
+      "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
+      "b.gt       1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_y),       // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1");
 }
 
 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
@@ -2693,28 +2553,26 @@
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
-  asm volatile (
-    "movi       v3.8b, #255                    \n"  // alpha
-    // 8 pixel loop.
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
-    MEMACCESS(1)
-    "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
-    MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_sobelx),  // %0
-    "+r"(src_sobely),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(width)        // %3
-  :
-  : "cc", "memory", "v0", "v1", "v2", "v3"
-  );
+void SobelXYRow_NEON(const uint8_t* src_sobelx,
+                     const uint8_t* src_sobely,
+                     uint8_t* dst_argb,
+                     int width) {
+  asm volatile(
+      "movi       v3.8b, #255                    \n"  // alpha
+      // 8 pixel loop.
+      "1:                                        \n"
+      "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
+      "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB
+      "b.gt       1b                             \n"
+      : "+r"(src_sobelx),  // %0
+        "+r"(src_sobely),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(width)        // %3
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
 // SobelX as a matrix is
@@ -2721,43 +2579,39 @@
 // -1  0  1
 // -2  0  2
 // -1  0  1
-void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0],%5               \n"  // top
-    MEMACCESS(0)
-    "ld1        {v1.8b}, [%0],%6               \n"
-    "usubl      v0.8h, v0.8b, v1.8b            \n"
-    MEMACCESS(1)
-    "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%6               \n"
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    MEMACCESS(2)
-    "ld1        {v2.8b}, [%2],%5               \n"  // bottom
-    MEMACCESS(2)
-    "ld1        {v3.8b}, [%2],%6               \n"
-    "subs       %w4, %w4, #8                   \n"  // 8 pixels
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "abs        v0.8h, v0.8h                   \n"
-    "uqxtn      v0.8b, v0.8h                   \n"
-    MEMACCESS(3)
-    "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
-    "b.gt       1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(src_y2),      // %2
-    "+r"(dst_sobelx),  // %3
-    "+r"(width)        // %4
-  : "r"(2LL),          // %5
-    "r"(6LL)           // %6
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+void SobelXRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    const uint8_t* src_y2,
+                    uint8_t* dst_sobelx,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.8b}, [%0],%5               \n"  // top
+      "ld1        {v1.8b}, [%0],%6               \n"
+      "usubl      v0.8h, v0.8b, v1.8b            \n"
+      "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
+      "ld1        {v3.8b}, [%1],%6               \n"
+      "usubl      v1.8h, v2.8b, v3.8b            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "ld1        {v2.8b}, [%2],%5               \n"  // bottom
+      "ld1        {v3.8b}, [%2],%6               \n"
+      "subs       %w4, %w4, #8                   \n"  // 8 pixels
+      "usubl      v1.8h, v2.8b, v3.8b            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "abs        v0.8h, v0.8h                   \n"
+      "uqxtn      v0.8b, v0.8h                   \n"
+      "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
+      "b.gt       1b                             \n"
+      : "+r"(src_y0),                           // %0
+        "+r"(src_y1),                           // %1
+        "+r"(src_y2),                           // %2
+        "+r"(dst_sobelx),                       // %3
+        "+r"(width)                             // %4
+      : "r"(2LL),                               // %5
+        "r"(6LL)                                // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
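
The immediate operands %5 = 2 and %6 = 6 step each row pointer so the two ld1
loads of a row read samples i and i+2 while still advancing 8 per iteration.
A scalar sketch of the kernel, assuming src_y0/src_y1/src_y2 are the top,
center and bottom rows (hypothetical helper, not part of the patch):

    #include <stdint.h>

    static void SobelXRow_sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                                 const uint8_t* src_y2, uint8_t* dst_sobelx,
                                 int width) {
      for (int i = 0; i < width; ++i) {
        int a = src_y0[i] - src_y0[i + 2];  // top row,    weight 1
        int b = src_y1[i] - src_y1[i + 2];  // center row, weight 2
        int c = src_y2[i] - src_y2[i + 2];  // bottom row, weight 1
        int sobel = a + b + b + c;          // center added twice
        if (sobel < 0) sobel = -sobel;      // abs
        dst_sobelx[i] = sobel > 255 ? 255 : (uint8_t)sobel;  // uqxtn saturates
      }
    }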
 
 // SobelY as a matrix is
@@ -2764,43 +2618,264 @@
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0],%4               \n"  // left
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1],%4               \n"
-    "usubl      v0.8h, v0.8b, v1.8b            \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%4               \n"
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0],%5               \n"  // right
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1],%5               \n"
-    "subs       %w3, %w3, #8                   \n"  // 8 pixels
-    "usubl      v1.8h, v2.8b, v3.8b            \n"
-    "add        v0.8h, v0.8h, v1.8h            \n"
-    "abs        v0.8h, v0.8h                   \n"
-    "uqxtn      v0.8b, v0.8h                   \n"
-    MEMACCESS(2)
-    "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
-    "b.gt       1b                             \n"
-  : "+r"(src_y0),      // %0
-    "+r"(src_y1),      // %1
-    "+r"(dst_sobely),  // %2
-    "+r"(width)        // %3
-  : "r"(1LL),          // %4
-    "r"(6LL)           // %5
-  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+void SobelYRow_NEON(const uint8_t* src_y0,
+                    const uint8_t* src_y1,
+                    uint8_t* dst_sobely,
+                    int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.8b}, [%0],%4               \n"  // left
+      "ld1        {v1.8b}, [%1],%4               \n"
+      "usubl      v0.8h, v0.8b, v1.8b            \n"
+      "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
+      "ld1        {v3.8b}, [%1],%4               \n"
+      "usubl      v1.8h, v2.8b, v3.8b            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "ld1        {v2.8b}, [%0],%5               \n"  // right
+      "ld1        {v3.8b}, [%1],%5               \n"
+      "subs       %w3, %w3, #8                   \n"  // 8 pixels
+      "usubl      v1.8h, v2.8b, v3.8b            \n"
+      "add        v0.8h, v0.8h, v1.8h            \n"
+      "abs        v0.8h, v0.8h                   \n"
+      "uqxtn      v0.8b, v0.8h                   \n"
+      "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
+      "b.gt       1b                             \n"
+      : "+r"(src_y0),                           // %0
+        "+r"(src_y1),                           // %1
+        "+r"(dst_sobely),                       // %2
+        "+r"(width)                             // %3
+      : "r"(1LL),                               // %4
+        "r"(6LL)                                // %5
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
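
SobelY differs only in sampling direction: %4 = 1 and %5 = 6 walk two source
rows horizontally, so each tap is a vertical difference between them. A scalar
sketch under the same assumptions:

    #include <stdint.h>

    static void SobelYRow_sketch(const uint8_t* src_y0, const uint8_t* src_y1,
                                 uint8_t* dst_sobely, int width) {
      for (int i = 0; i < width; ++i) {
        int a = src_y0[i] - src_y1[i];          // left column,   weight 1
        int b = src_y0[i + 1] - src_y1[i + 1];  // center column, weight 2
        int c = src_y0[i + 2] - src_y1[i + 2];  // right column,  weight 1
        int sobel = a + b + b + c;
        if (sobel < 0) sobel = -sobel;
        dst_sobely[i] = sobel > 255 ? 255 : (uint8_t)sobel;
      }
    }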
+
+// Caveat: rounds float to half float, whereas the scaling version truncates.
+void HalfFloat1Row_NEON(const uint16_t* src,
+                        uint16_t* dst,
+                        float /*unused*/,
+                        int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
+      "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
+      "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
+      "uxtl2      v3.4s, v1.8h                   \n"
+      "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
+      "scvtf      v3.4s, v3.4s                   \n"
+      "fcvtn      v1.4h, v2.4s                   \n"  // 8 half floats
+      "fcvtn2     v1.8h, v3.4s                   \n"
+      "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
+      "b.gt       1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      :
+      : "cc", "memory", "v1", "v2", "v3");
+}
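
This is the scvtf + fcvtn path: each uint16 is widened to float and then
narrowed to IEEE half with round-to-nearest, which is where the rounding
caveat above comes from. A scalar model, assuming the ARM __fp16 storage type
is available:

    #include <stdint.h>
    #include <string.h>

    static void HalfFloat1Row_sketch(const uint16_t* src, uint16_t* dst,
                                     int width) {
      for (int i = 0; i < width; ++i) {
        __fp16 h = (__fp16)(float)src[i];  // fcvtn: round to nearest even
        memcpy(&dst[i], &h, sizeof(h));    // store the raw half-float bits
      }
    }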
+
+void HalfFloatRow_NEON(const uint16_t* src,
+                       uint16_t* dst,
+                       float scale,
+                       int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
+      "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
+      "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
+      "uxtl2      v3.4s, v1.8h                   \n"
+      "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
+      "scvtf      v3.4s, v3.4s                   \n"
+      "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // adjust exponent
+      "fmul       v3.4s, v3.4s, %3.s[0]          \n"
+      "uqshrn     v1.4h, v2.4s, #13              \n"  // isolate halffloat
+      "uqshrn2    v1.8h, v3.4s, #13              \n"
+      "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
+      "b.gt       1b                             \n"
+      : "+r"(src),                      // %0
+        "+r"(dst),                      // %1
+        "+r"(width)                     // %2
+      : "w"(scale * 1.9259299444e-34f)  // %3
+      : "cc", "memory", "v1", "v2", "v3");
+}
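
The odd-looking constant is 2^-112. Multiplying a binary32 value by
scale * 2^-112 subtracts 112 from its exponent, turning the float bias of 127
into the half-float bias of 15; uqshrn #13 then extracts bits 13..28 of the
float bit pattern, which are exactly the half-float exponent and mantissa
fields (truncating the low mantissa bits, hence the caveat above). A sketch of
the trick for one non-negative sample, with a hypothetical helper name:

    #include <stdint.h>
    #include <string.h>

    static uint16_t ToHalfBits(float v, float scale) {
      float f = v * (scale * 1.9259299444e-34f);  // scale * 2^-112
      uint32_t bits;
      memcpy(&bits, &f, sizeof(bits));            // reinterpret float as bits
      uint32_t h = bits >> 13;                    // uqshrn #13: shift...
      return h > 0xffff ? 0xffff : (uint16_t)h;   // ...and saturate to 16 bits
    }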
+
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v1.8b}, [%0], #8              \n"  // load 8 bytes
+      "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
+      "uxtl       v1.8h, v1.8b                   \n"  // 8 shorts
+      "uxtl       v2.4s, v1.4h                   \n"  // 8 ints
+      "uxtl2      v3.4s, v1.8h                   \n"
+      "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
+      "scvtf      v3.4s, v3.4s                   \n"
+      "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // scale
+      "fmul       v3.4s, v3.4s, %3.s[0]          \n"
+      "st1        {v2.16b, v3.16b}, [%1], #32    \n"  // store 8 floats
+      "b.gt       1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "w"(scale)   // %3
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
+float ScaleMaxSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width) {
+  float fmax;
+  asm volatile(
+      "movi       v5.4s, #0                      \n"  // max
+      "movi       v6.4s, #0                      \n"
+
+      "1:                                        \n"
+      "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "fmul       v3.4s, v1.4s, %4.s[0]          \n"  // scale
+      "fmul       v4.4s, v2.4s, %4.s[0]          \n"  // scale
+      "fmax       v5.4s, v5.4s, v1.4s            \n"  // max
+      "fmax       v6.4s, v6.4s, v2.4s            \n"
+      "st1        {v3.4s, v4.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      "fmax       v5.4s, v5.4s, v6.4s            \n"  // max
+      "fmaxv      %s3, v5.4s                     \n"  // signed max acculator
+      : "+r"(src),                                    // %0
+        "+r"(dst),                                    // %1
+        "+r"(width),                                  // %2
+        "=w"(fmax)                                    // %3
+      : "w"(scale)                                    // %4
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fmax;
+}
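
Note the asymmetry: the scaled products in v3/v4 go to dst, but fmax
accumulates the unscaled inputs v1/v2, so the return value is the maximum of
the original samples. Scalar sketch (hypothetical helper name):

    static float ScaleMaxSamples_sketch(const float* src, float* dst,
                                        float scale, int width) {
      float fmax = 0.0f;
      for (int i = 0; i < width; ++i) {
        float v = src[i];
        dst[i] = v * scale;      // store the scaled sample
        if (v > fmax) fmax = v;  // max of the unscaled input
      }
      return fmax;
    }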
+
+float ScaleSumSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width) {
+  float fsum;
+  asm volatile(
+      "movi       v5.4s, #0                      \n"  // max
+      "movi       v6.4s, #0                      \n"  // max
+
+      "1:                                        \n"
+      "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "fmul       v3.4s, v1.4s, %4.s[0]          \n"  // scale
+      "fmul       v4.4s, v2.4s, %4.s[0]          \n"
+      "fmla       v5.4s, v1.4s, v1.4s            \n"  // sum of squares
+      "fmla       v6.4s, v2.4s, v2.4s            \n"
+      "st1        {v3.4s, v4.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      "faddp      v5.4s, v5.4s, v6.4s            \n"
+      "faddp      v5.4s, v5.4s, v5.4s            \n"
+      "faddp      %3.4s, v5.4s, v5.4s            \n"  // sum
+      : "+r"(src),                                    // %0
+        "+r"(dst),                                    // %1
+        "+r"(width),                                  // %2
+        "=w"(fsum)                                    // %3
+      : "w"(scale)                                    // %4
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fsum;
+}
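
Despite the name, the accumulator holds a sum of squares (fmla v5, v1, v1),
again over the unscaled inputs; the three faddp steps reduce the two 4-lane
accumulators to one scalar. Scalar sketch:

    static float ScaleSumSamples_sketch(const float* src, float* dst,
                                        float scale, int width) {
      float fsum = 0.0f;
      for (int i = 0; i < width; ++i) {
        float v = src[i];
        dst[i] = v * scale;  // store the scaled sample
        fsum += v * v;       // fmla: sum of squares of the unscaled input
      }
      return fsum;
    }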
+
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "fmul       v1.4s, v1.4s, %3.s[0]          \n"  // scale
+      "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // scale
+      "st1        {v1.4s, v2.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(dst),   // %1
+        "+r"(width)  // %2
+      : "w"(scale)   // %3
+      : "cc", "memory", "v1", "v2");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_NEON(const uint16_t* src0,
+                   const uint16_t* src1,
+                   const uint16_t* src2,
+                   const uint16_t* src3,
+                   const uint16_t* src4,
+                   uint32_t* dst,
+                   int width) {
+  asm volatile(
+      "movi       v6.8h, #4                      \n"  // constant 4
+      "movi       v7.8h, #6                      \n"  // constant 6
+
+      "1:                                        \n"
+      "ld1        {v1.8h}, [%0], #16             \n"  // load 8 samples, 5 rows
+      "ld1        {v2.8h}, [%4], #16             \n"
+      "uaddl      v0.4s, v1.4h, v2.4h            \n"  // * 1
+      "uaddl2     v1.4s, v1.8h, v2.8h            \n"  // * 1
+      "ld1        {v2.8h}, [%1], #16             \n"
+      "umlal      v0.4s, v2.4h, v6.4h            \n"  // * 4
+      "umlal2     v1.4s, v2.8h, v6.8h            \n"  // * 4
+      "ld1        {v2.8h}, [%2], #16             \n"
+      "umlal      v0.4s, v2.4h, v7.4h            \n"  // * 6
+      "umlal2     v1.4s, v2.8h, v7.8h            \n"  // * 6
+      "ld1        {v2.8h}, [%3], #16             \n"
+      "umlal      v0.4s, v2.4h, v6.4h            \n"  // * 4
+      "umlal2     v1.4s, v2.8h, v6.8h            \n"  // * 4
+      "subs       %w6, %w6, #8                   \n"  // 8 processed per loop
+      "st1        {v0.4s,v1.4s}, [%5], #32       \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src0),  // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(src4),  // %4
+        "+r"(dst),   // %5
+        "+r"(width)  // %6
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
+}
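
GaussCol is the vertical pass: one sample from each of five rows, widened to
32 bits by uaddl/umlal with no normalizing shift (the >> 8 happens in the
horizontal pass below). Scalar sketch:

    #include <stdint.h>

    static void GaussCol_sketch(const uint16_t* src0, const uint16_t* src1,
                                const uint16_t* src2, const uint16_t* src3,
                                const uint16_t* src4, uint32_t* dst,
                                int width) {
      for (int i = 0; i < width; ++i) {
        dst[i] = src0[i] + src1[i] * 4 + src2[i] * 6 + src3[i] * 4 + src4[i];
      }
    }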
+
+// filter 5 adjacent samples with 1, 4, 6, 4, 1 coefficients to produce 1
+// sample.
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
+  const uint32_t* src1 = src + 1;
+  const uint32_t* src2 = src + 2;
+  const uint32_t* src3 = src + 3;
+  asm volatile(
+      "movi       v6.4s, #4                      \n"  // constant 4
+      "movi       v7.4s, #6                      \n"  // constant 6
+
+      "1:                                        \n"
+      "ld1        {v0.4s,v1.4s,v2.4s}, [%0], %6  \n"  // load 12 source samples
+      "add        v0.4s, v0.4s, v1.4s            \n"  // * 1
+      "add        v1.4s, v1.4s, v2.4s            \n"  // * 1
+      "ld1        {v2.4s,v3.4s}, [%2], #32       \n"
+      "mla        v0.4s, v2.4s, v7.4s            \n"  // * 6
+      "mla        v1.4s, v3.4s, v7.4s            \n"  // * 6
+      "ld1        {v2.4s,v3.4s}, [%1], #32       \n"
+      "ld1        {v4.4s,v5.4s}, [%3], #32       \n"
+      "add        v2.4s, v2.4s, v4.4s            \n"  // add rows for * 4
+      "add        v3.4s, v3.4s, v5.4s            \n"
+      "mla        v0.4s, v2.4s, v6.4s            \n"  // * 4
+      "mla        v1.4s, v3.4s, v6.4s            \n"  // * 4
+      "subs       %w5, %w5, #8                   \n"  // 8 processed per loop
+      "uqrshrn    v0.4h, v0.4s, #8               \n"  // round and pack
+      "uqrshrn2   v0.8h, v1.4s, #8               \n"
+      "st1        {v0.8h}, [%4], #16             \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src),   // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(dst),   // %4
+        "+r"(width)  // %5
+      : "r"(32LL)    // %6
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
+}
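
GaussRow is the horizontal pass over GaussCol's 32-bit output: five adjacent
samples per result. The 1-4-6-4-1 kernel sums to 16, so the two passes
together scale by 256, which uqrshrn #8 removes with rounding and saturation.
Scalar sketch mirroring the asm:

    #include <stdint.h>

    static void GaussRow_sketch(const uint32_t* src, uint16_t* dst,
                                int width) {
      for (int i = 0; i < width; ++i) {
        uint32_t sum = src[i] + src[i + 1] * 4 + src[i + 2] * 6 +
                       src[i + 3] * 4 + src[i + 4];
        uint32_t v = (sum + 128) >> 8;               // uqrshrn: rounded shift
        dst[i] = v > 0xffff ? 0xffff : (uint16_t)v;  // saturate to uint16
      }
    }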
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
--- a/third_party/libyuv/source/row_win.cc
+++ b/third_party/libyuv/source/row_win.cc
@@ -28,72 +28,71 @@
 #if defined(_M_X64)
 
 // Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422                                                             \
-    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
-    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
-    u_buf += 4;                                                                \
-    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
-    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
-    y_buf += 8;
+#define READYUV422                                        \
+  xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
+  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                   \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                  \
+  u_buf += 4;                                             \
+  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
+  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
+  y_buf += 8;
 
 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422                                                            \
-    xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);                                 \
-    xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));                      \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                                     \
-    u_buf += 4;                                                                \
-    xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                                   \
-    xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                                      \
-    y_buf += 8;                                                                \
-    xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                                   \
-    a_buf += 8;
+#define READYUVA422                                       \
+  xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf);            \
+  xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                   \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);                  \
+  u_buf += 4;                                             \
+  xmm4 = _mm_loadl_epi64((__m128i*)y_buf);                \
+  xmm4 = _mm_unpacklo_epi8(xmm4, xmm4);                   \
+  y_buf += 8;                                             \
+  xmm5 = _mm_loadl_epi64((__m128i*)a_buf);                \
+  a_buf += 8;
 
 // Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(yuvconstants)                                                 \
-    xmm1 = _mm_loadu_si128(&xmm0);                                             \
-    xmm2 = _mm_loadu_si128(&xmm0);                                             \
-    xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB);           \
-    xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG);           \
-    xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR);           \
-    xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);             \
-    xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);             \
-    xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);             \
-    xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);            \
-    xmm0 = _mm_adds_epi16(xmm0, xmm4);                                         \
-    xmm1 = _mm_adds_epi16(xmm1, xmm4);                                         \
-    xmm2 = _mm_adds_epi16(xmm2, xmm4);                                         \
-    xmm0 = _mm_srai_epi16(xmm0, 6);                                            \
-    xmm1 = _mm_srai_epi16(xmm1, 6);                                            \
-    xmm2 = _mm_srai_epi16(xmm2, 6);                                            \
-    xmm0 = _mm_packus_epi16(xmm0, xmm0);                                       \
-    xmm1 = _mm_packus_epi16(xmm1, xmm1);                                       \
-    xmm2 = _mm_packus_epi16(xmm2, xmm2);
+#define YUVTORGB(yuvconstants)                                     \
+  xmm1 = _mm_loadu_si128(&xmm0);                                   \
+  xmm2 = _mm_loadu_si128(&xmm0);                                   \
+  xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \
+  xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \
+  xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \
+  xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0);   \
+  xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1);   \
+  xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2);   \
+  xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb);  \
+  xmm0 = _mm_adds_epi16(xmm0, xmm4);                               \
+  xmm1 = _mm_adds_epi16(xmm1, xmm4);                               \
+  xmm2 = _mm_adds_epi16(xmm2, xmm4);                               \
+  xmm0 = _mm_srai_epi16(xmm0, 6);                                  \
+  xmm1 = _mm_srai_epi16(xmm1, 6);                                  \
+  xmm2 = _mm_srai_epi16(xmm2, 6);                                  \
+  xmm0 = _mm_packus_epi16(xmm0, xmm0);                             \
+  xmm1 = _mm_packus_epi16(xmm1, xmm1);                             \
+  xmm2 = _mm_packus_epi16(xmm2, xmm2);
 
 // Store 8 ARGB values.
-#define STOREARGB                                                              \
-    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);                                      \
-    xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);                                      \
-    xmm1 = _mm_loadu_si128(&xmm0);                                             \
-    xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);                                     \
-    xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);                                     \
-    _mm_storeu_si128((__m128i *)dst_argb, xmm0);                               \
-    _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);                        \
-    dst_argb += 32;
+#define STOREARGB                                    \
+  xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);              \
+  xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);              \
+  xmm1 = _mm_loadu_si128(&xmm0);                     \
+  xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);             \
+  xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);             \
+  _mm_storeu_si128((__m128i*)dst_argb, xmm0);        \
+  _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
+  dst_argb += 32;
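
Taken together, READYUV422, YUVTORGB and STOREARGB implement the usual 6-bit
fixed-point YUV conversion: per channel, clamp8(((bias - dot(uv, coeff)) +
mulhi(y, ygain)) >> 6). A rough scalar model of one channel, with hypothetical
uv_dot/bias/y_contrib standing in for values derived from struct YuvConstants:

    #include <stdint.h>

    static uint8_t YuvChannel_sketch(int uv_dot, int bias, int y_contrib) {
      // uv_dot    ~ pmaddubsw of the UV pair with kUVTo{B,G,R}
      // bias      ~ kUVBias{B,G,R}; y_contrib ~ pmulhuw of Y with kYToRgb
      int v = ((bias - uv_dot) + y_contrib) >> 6;       // psubw/paddsw, psraw
      return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);  // packuswb clamps
    }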
 
-
 #if defined(HAS_I422TOARGBROW_SSSE3)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
+void I422ToARGBRow_SSSE3(const uint8_t* y_buf,
+                         const uint8_t* u_buf,
+                         const uint8_t* v_buf,
+                         uint8_t* dst_argb,
                          const struct YuvConstants* yuvconstants,
                          int width) {
   __m128i xmm0, xmm1, xmm2, xmm4;
   const __m128i xmm5 = _mm_set1_epi8(-1);
-  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
   while (width > 0) {
     READYUV422
     YUVTORGB(yuvconstants)
@@ -104,15 +103,15 @@
 #endif
 
 #if defined(HAS_I422ALPHATOARGBROW_SSSE3)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_argb,
+void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf,
+                              const uint8_t* u_buf,
+                              const uint8_t* v_buf,
+                              const uint8_t* a_buf,
+                              uint8_t* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
   __m128i xmm0, xmm1, xmm2, xmm4, xmm5;
-  const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+  const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf;
   while (width > 0) {
     READYUVA422
     YUVTORGB(yuvconstants)
@@ -127,175 +126,143 @@
 #ifdef HAS_ARGBTOYROW_SSSE3
 
 // Constants for ARGB.
-static const vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
+static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
+                              13, 65, 33, 0, 13, 65, 33, 0};
 
 // JPeg full range.
-static const vec8 kARGBToYJ = {
-  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
-};
+static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
+                               15, 75, 38, 0, 15, 75, 38, 0};
 
-static const vec8 kARGBToU = {
-  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
-};
+static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
+                              112, -74, -38, 0, 112, -74, -38, 0};
 
-static const vec8 kARGBToUJ = {
-  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
-};
+static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
+                               127, -84, -43, 0, 127, -84, -43, 0};
 
 static const vec8 kARGBToV = {
-  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };
 
-static const vec8 kARGBToVJ = {
-  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
-};
+static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
+                               -20, -107, 127, 0, -20, -107, 127, 0};
 
 // vpshufb for vphaddw + vpackuswb packed to shorts.
 static const lvec8 kShufARGBToUV_AVX = {
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
-  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-};
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
+    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
 
 // Constants for BGRA.
-static const vec8 kBGRAToY = {
-  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
-};
+static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
+                              0, 33, 65, 13, 0, 33, 65, 13};
 
-static const vec8 kBGRAToU = {
-  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
-};
+static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
+                              0, -38, -74, 112, 0, -38, -74, 112};
 
-static const vec8 kBGRAToV = {
-  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
-};
+static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
+                              0, 112, -94, -18, 0, 112, -94, -18};
 
 // Constants for ABGR.
-static const vec8 kABGRToY = {
-  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
-};
+static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
+                              33, 65, 13, 0, 33, 65, 13, 0};
 
-static const vec8 kABGRToU = {
-  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
-};
+static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
+                              -38, -74, 112, 0, -38, -74, 112, 0};
 
-static const vec8 kABGRToV = {
-  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
-};
+static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
+                              112, -94, -18, 0, 112, -94, -18, 0};
 
 // Constants for RGBA.
-static const vec8 kRGBAToY = {
-  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
-};
+static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
+                              0, 13, 65, 33, 0, 13, 65, 33};
 
-static const vec8 kRGBAToU = {
-  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
-};
+static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
+                              0, 112, -74, -38, 0, 112, -74, -38};
 
-static const vec8 kRGBAToV = {
-  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
-};
+static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
+                              0, -18, -94, 112, 0, -18, -94, 112};
 
-static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
+static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
 
 // 7 bit fixed point 0.5.
-static const vec16 kAddYJ64 = {
-  64, 64, 64, 64, 64, 64, 64, 64
-};
+static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
 
-static const uvec8 kAddUV128 = {
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
-static const uvec16 kAddUVJ128 = {
-  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
-};
+static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
+                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
 
 // Shuffle table for converting RGB24 to ARGB.
 static const uvec8 kShuffleMaskRGB24ToARGB = {
-  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
-};
+    0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
 
 // Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {
-  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
-};
+static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
+                                            8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
 
 // Shuffle table for converting RAW to RGB24.  First 8.
 static const uvec8 kShuffleMaskRAWToRGB24_0 = {
-  2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Middle 8.
 static const uvec8 kShuffleMaskRAWToRGB24_1 = {
-  2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting RAW to RGB24.  Last 8.
 static const uvec8 kShuffleMaskRAWToRGB24_2 = {
-  8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
-  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
-};
+    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
+    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RGB24.
 static const uvec8 kShuffleMaskARGBToRGB24 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
-};
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGB to RAW.
 static const uvec8 kShuffleMaskARGBToRAW = {
-  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
-};
+    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
 
 // Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
 static const uvec8 kShuffleMaskARGBToRGB24_0 = {
-  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
-};
+    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
 
 // YUY2 shuf 16 Y to 32 Y.
-static const lvec8 kShuffleYUY2Y = {
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14,
-  0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14
-};
+static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
+                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
+                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};
 
 // YUY2 shuf 8 UV to 16 UV.
-static const lvec8 kShuffleYUY2UV = {
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15,
-  1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15
-};
+static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
+                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
+                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};
 
 // UYVY shuf 16 Y to 32 Y.
-static const lvec8 kShuffleUYVYY = {
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15,
-  1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15
-};
+static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
+                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
+                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};
 
 // UYVY shuf 8 UV to 16 UV.
-static const lvec8 kShuffleUYVYUV = {
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14,
-  0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14
-};
+static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
+                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
+                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};
 
 // NV21 shuf 8 VU to 16 UV.
 static const lvec8 kShuffleNV21 = {
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
-  1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
+    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
 };
 
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked)
-void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
+__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y,
+                                          uint8_t* dst_argb,
+                                          int width) {
   __asm {
-    mov        eax, [esp + 4]        // src_y
-    mov        edx, [esp + 8]        // dst_argb
-    mov        ecx, [esp + 12]       // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
+    mov        eax, [esp + 4]  // src_y
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff000000
     pslld      xmm5, 24
 
   convertloop:
@@ -318,13 +285,14 @@
 
 #ifdef HAS_J400TOARGBROW_AVX2
 // Duplicates gray value 3 times and fills in alpha opaque.
-__declspec(naked)
-void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
+__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y,
+                                          uint8_t* dst_argb,
+                                          int width) {
   __asm {
-    mov         eax, [esp + 4]        // src_y
-    mov         edx, [esp + 8]        // dst_argb
-    mov         ecx, [esp + 12]       // width
-    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0xff000000
+    mov         eax, [esp + 4]  // src_y
+    mov         edx, [esp + 8]  // dst_argb
+    mov         ecx, [esp + 12]  // width
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff000000
     vpslld      ymm5, ymm5, 24
 
   convertloop:
@@ -348,13 +316,14 @@
 }
 #endif  // HAS_J400TOARGBROW_AVX2
 
-__declspec(naked)
-void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
+__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_rgb24
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_rgb24
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
     pslld     xmm5, 24
     movdqa    xmm4, xmmword ptr kShuffleMaskRGB24ToARGB
 
@@ -364,10 +333,10 @@
     movdqu    xmm3, [eax + 32]
     lea       eax, [eax + 48]
     movdqa    xmm2, xmm3
-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
     pshufb    xmm2, xmm4
     por       xmm2, xmm5
-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
     movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
@@ -374,7 +343,7 @@
     pshufb    xmm1, xmm4
     movdqu    [edx], xmm0
     por       xmm1, xmm5
-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
     movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
@@ -386,14 +355,14 @@
   }
 }
 
-__declspec(naked)
-void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
-                        int width) {
+__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw,
+                                          uint8_t* dst_argb,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_raw
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_raw
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
+    pcmpeqb   xmm5, xmm5  // generate mask 0xff000000
     pslld     xmm5, 24
     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToARGB
 
@@ -403,10 +372,10 @@
     movdqu    xmm3, [eax + 32]
     lea       eax, [eax + 48]
     movdqa    xmm2, xmm3
-    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
+    palignr   xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
     pshufb    xmm2, xmm4
     por       xmm2, xmm5
-    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
+    palignr   xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
     pshufb    xmm0, xmm4
     movdqu    [edx + 32], xmm2
     por       xmm0, xmm5
@@ -413,7 +382,7 @@
     pshufb    xmm1, xmm4
     movdqu    [edx], xmm0
     por       xmm1, xmm5
-    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
+    palignr   xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
     pshufb    xmm3, xmm4
     movdqu    [edx + 16], xmm1
     por       xmm3, xmm5
@@ -425,11 +394,12 @@
   }
 }
 
-__declspec(naked)
-void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
+__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw,
+                                           uint8_t* dst_rgb24,
+                                           int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_raw
-    mov       edx, [esp + 8]   // dst_rgb24
+    mov       eax, [esp + 4]  // src_raw
+    mov       edx, [esp + 8]  // dst_rgb24
     mov       ecx, [esp + 12]  // width
     movdqa    xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0
     movdqa    xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1
@@ -460,9 +430,9 @@
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
 // 20 instructions.
-__declspec(naked)
-void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
-                          int width) {
+__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
@@ -470,33 +440,33 @@
     mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
     movd      xmm6, eax
     pshufd    xmm6, xmm6, 0
-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
     psllw     xmm3, 11
-    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
+    pcmpeqb   xmm4, xmm4  // generate mask 0x07e007e0 for Green
     psllw     xmm4, 10
     psrlw     xmm4, 5
-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
     psllw     xmm7, 8
 
-    mov       eax, [esp + 4]   // src_rgb565
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_rgb565
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
+    movdqu    xmm0, [eax]  // fetch 8 pixels of bgr565
     movdqa    xmm1, xmm0
     movdqa    xmm2, xmm0
-    pand      xmm1, xmm3    // R in upper 5 bits
-    psllw     xmm2, 11      // B in upper 5 bits
-    pmulhuw   xmm1, xmm5    // * (256 + 8)
-    pmulhuw   xmm2, xmm5    // * (256 + 8)
+    pand      xmm1, xmm3  // R in upper 5 bits
+    psllw     xmm2, 11  // B in upper 5 bits
+    pmulhuw   xmm1, xmm5  // * (256 + 8)
+    pmulhuw   xmm2, xmm5  // * (256 + 8)
     psllw     xmm1, 8
-    por       xmm1, xmm2    // RB
-    pand      xmm0, xmm4    // G in middle 6 bits
-    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
-    por       xmm0, xmm7    // AG
+    por       xmm1, xmm2  // RB
+    pand      xmm0, xmm4  // G in middle 6 bits
+    pmulhuw   xmm0, xmm6  // << 5 * (256 + 4)
+    por       xmm0, xmm7  // AG
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
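
The multiply-high constants above are a bit-replication trick: a 5-bit channel
v widens to 8 bits as (v << 3) | (v >> 2), which equals (v * (256 + 8)) >> 5,
so a single pmulhuw per channel does the widening once the field sits at the
top of its 16-bit lane. Worked example in plain C:

    #include <stdint.h>

    static uint8_t Expand5To8(uint16_t v) {    // v in [0, 31]
      return (uint8_t)((v * (256 + 8)) >> 5);  // == (v << 3) | (v >> 2)
    }
    // e.g. Expand5To8(31) == 255; Expand5To8(16) == 132 == (16<<3)|(16>>2).
    // The 6-bit green field uses (v * (256 + 4)) >> 6 == (v << 2) | (v >> 4).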
@@ -516,9 +486,9 @@
 // v * 256 + v * 8
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
-__declspec(naked)
-void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb,
-                          int width) {
+__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
     vmovd      xmm5, eax
@@ -526,32 +496,32 @@
     mov        eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
     vmovd      xmm6, eax
     vbroadcastss ymm6, xmm6
-    vpcmpeqb   ymm3, ymm3, ymm3       // generate mask 0xf800f800 for Red
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
     vpsllw     ymm3, ymm3, 11
-    vpcmpeqb   ymm4, ymm4, ymm4       // generate mask 0x07e007e0 for Green
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x07e007e0 for Green
     vpsllw     ymm4, ymm4, 10
     vpsrlw     ymm4, ymm4, 5
-    vpcmpeqb   ymm7, ymm7, ymm7       // generate mask 0xff00ff00 for Alpha
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
     vpsllw     ymm7, ymm7, 8
 
-    mov        eax, [esp + 4]   // src_rgb565
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_rgb565
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     sub        edx, eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]   // fetch 16 pixels of bgr565
-    vpand      ymm1, ymm0, ymm3    // R in upper 5 bits
-    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
-    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
-    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgr565
+    vpand      ymm1, ymm0, ymm3  // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
+    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
     vpsllw     ymm1, ymm1, 8
-    vpor       ymm1, ymm1, ymm2    // RB
-    vpand      ymm0, ymm0, ymm4    // G in middle 6 bits
-    vpmulhuw   ymm0, ymm0, ymm6    // << 5 * (256 + 4)
-    vpor       ymm0, ymm0, ymm7    // AG
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpor       ymm1, ymm1, ymm2  // RB
+    vpand      ymm0, ymm0, ymm4  // G in middle 6 bits
+    vpmulhuw   ymm0, ymm0, ymm6  // << 5 * (256 + 4)
+    vpor       ymm0, ymm0, ymm7  // AG
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm1, ymm1, 0xd8
     vpunpckhbw ymm2, ymm1, ymm0
     vpunpcklbw ymm1, ymm1, ymm0
@@ -567,9 +537,9 @@
 #endif  // HAS_RGB565TOARGBROW_AVX2
 
 #ifdef HAS_ARGB1555TOARGBROW_AVX2
-__declspec(naked)
-void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov        eax, 0x01080108  // generate multiplier to repeat 5 bits
     vmovd      xmm5, eax
@@ -577,33 +547,33 @@
     mov        eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
     vmovd      xmm6, eax
     vbroadcastss ymm6, xmm6
-    vpcmpeqb   ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0xf800f800 for Red
     vpsllw     ymm3, ymm3, 11
-    vpsrlw     ymm4, ymm3, 6    // generate mask 0x03e003e0 for Green
-    vpcmpeqb   ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha
+    vpsrlw     ymm4, ymm3, 6  // generate mask 0x03e003e0 for Green
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xff00ff00 for Alpha
     vpsllw     ymm7, ymm7, 8
 
-    mov        eax,  [esp + 4]   // src_argb1555
-    mov        edx,  [esp + 8]   // dst_argb
+    mov        eax,  [esp + 4]  // src_argb1555
+    mov        edx,  [esp + 8]  // dst_argb
     mov        ecx,  [esp + 12]  // width
     sub        edx,  eax
     sub        edx,  eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 16 pixels of 1555
-    vpsllw     ymm1, ymm0, 1       // R in upper 5 bits
-    vpsllw     ymm2, ymm0, 11      // B in upper 5 bits
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of 1555
+    vpsllw     ymm1, ymm0, 1  // R in upper 5 bits
+    vpsllw     ymm2, ymm0, 11  // B in upper 5 bits
     vpand      ymm1, ymm1, ymm3
-    vpmulhuw   ymm2, ymm2, ymm5    // * (256 + 8)
-    vpmulhuw   ymm1, ymm1, ymm5    // * (256 + 8)
+    vpmulhuw   ymm2, ymm2, ymm5  // * (256 + 8)
+    vpmulhuw   ymm1, ymm1, ymm5  // * (256 + 8)
     vpsllw     ymm1, ymm1, 8
-    vpor       ymm1, ymm1, ymm2    // RB
-    vpsraw     ymm2, ymm0, 8       // A
-    vpand      ymm0, ymm0, ymm4    // G in middle 5 bits
-    vpmulhuw   ymm0, ymm0, ymm6    // << 6 * (256 + 8)
+    vpor       ymm1, ymm1, ymm2  // RB
+    vpsraw     ymm2, ymm0, 8  // A
+    vpand      ymm0, ymm0, ymm4  // G in middle 5 bits
+    vpmulhuw   ymm0, ymm0, ymm6  // << 6 * (256 + 8)
     vpand      ymm2, ymm2, ymm7
-    vpor       ymm0, ymm0, ymm2    // AG
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpor       ymm0, ymm0, ymm2  // AG
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm1, ymm1, 0xd8
     vpunpckhbw ymm2, ymm1, ymm0
     vpunpcklbw ymm1, ymm1, ymm0
@@ -619,29 +589,29 @@
 #endif  // HAS_ARGB1555TOARGBROW_AVX2
 
 #ifdef HAS_ARGB4444TOARGBROW_AVX2
-__declspec(naked)
-void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov       eax,  0x0f0f0f0f  // generate mask 0x0f0f0f0f
     vmovd     xmm4, eax
     vbroadcastss ymm4, xmm4
-    vpslld    ymm5, ymm4, 4     // 0xf0f0f0f0 for high nibbles
-    mov       eax,  [esp + 4]   // src_argb4444
-    mov       edx,  [esp + 8]   // dst_argb
+    vpslld    ymm5, ymm4, 4  // 0xf0f0f0f0 for high nibbles
+    mov       eax,  [esp + 4]  // src_argb4444
+    mov       edx,  [esp + 8]  // dst_argb
     mov       ecx,  [esp + 12]  // width
     sub       edx,  eax
     sub       edx,  eax
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 16 pixels of bgra4444
-    vpand      ymm2, ymm0, ymm5    // mask high nibbles
-    vpand      ymm0, ymm0, ymm4    // mask low nibbles
+    vmovdqu    ymm0, [eax]  // fetch 16 pixels of bgra4444
+    vpand      ymm2, ymm0, ymm5  // mask high nibbles
+    vpand      ymm0, ymm0, ymm4  // mask low nibbles
     vpsrlw     ymm3, ymm2, 4
     vpsllw     ymm1, ymm0, 4
     vpor       ymm2, ymm2, ymm3
     vpor       ymm0, ymm0, ymm1
-    vpermq     ymm0, ymm0, 0xd8    // mutate for unpack
+    vpermq     ymm0, ymm0, 0xd8  // mutate for unpack
     vpermq     ymm2, ymm2, 0xd8
     vpunpckhbw ymm1, ymm0, ymm2
     vpunpcklbw ymm0, ymm0, ymm2
@@ -657,9 +627,9 @@
 #endif  // HAS_ARGB4444TOARGBROW_AVX2
 
 // 24 instructions
-__declspec(naked)
-void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
     movd      xmm5, eax
@@ -667,36 +637,36 @@
     mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
     movd      xmm6, eax
     pshufd    xmm6, xmm6, 0
-    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
+    pcmpeqb   xmm3, xmm3  // generate mask 0xf800f800 for Red
     psllw     xmm3, 11
-    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
+    movdqa    xmm4, xmm3  // generate mask 0x03e003e0 for Green
     psrlw     xmm4, 6
-    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
+    pcmpeqb   xmm7, xmm7  // generate mask 0xff00ff00 for Alpha
     psllw     xmm7, 8
 
-    mov       eax, [esp + 4]   // src_argb1555
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_argb1555
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
+    movdqu    xmm0, [eax]  // fetch 8 pixels of 1555
     movdqa    xmm1, xmm0
     movdqa    xmm2, xmm0
-    psllw     xmm1, 1       // R in upper 5 bits
-    psllw     xmm2, 11      // B in upper 5 bits
+    psllw     xmm1, 1  // R in upper 5 bits
+    psllw     xmm2, 11  // B in upper 5 bits
     pand      xmm1, xmm3
-    pmulhuw   xmm2, xmm5    // * (256 + 8)
-    pmulhuw   xmm1, xmm5    // * (256 + 8)
+    pmulhuw   xmm2, xmm5  // * (256 + 8)
+    pmulhuw   xmm1, xmm5  // * (256 + 8)
     psllw     xmm1, 8
-    por       xmm1, xmm2    // RB
+    por       xmm1, xmm2  // RB
     movdqa    xmm2, xmm0
-    pand      xmm0, xmm4    // G in middle 5 bits
-    psraw     xmm2, 8       // A
-    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
+    pand      xmm0, xmm4  // G in middle 5 bits
+    psraw     xmm2, 8  // A
+    pmulhuw   xmm0, xmm6  // << 6 * (256 + 8)
     pand      xmm2, xmm7
-    por       xmm0, xmm2    // AG
+    por       xmm0, xmm2  // AG
     movdqa    xmm2, xmm1
     punpcklbw xmm1, xmm0
     punpckhbw xmm2, xmm0
@@ -710,26 +680,26 @@
 }
 
 // 18 instructions.
-__declspec(naked)
-void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
-                            int width) {
+__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
     mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
     movd      xmm4, eax
     pshufd    xmm4, xmm4, 0
-    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
+    movdqa    xmm5, xmm4  // 0xf0f0f0f0 for high nibbles
     pslld     xmm5, 4
-    mov       eax, [esp + 4]   // src_argb4444
-    mov       edx, [esp + 8]   // dst_argb
+    mov       eax, [esp + 4]  // src_argb4444
+    mov       edx, [esp + 8]  // dst_argb
     mov       ecx, [esp + 12]  // width
     sub       edx, eax
     sub       edx, eax
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
+    movdqu    xmm0, [eax]  // fetch 8 pixels of bgra4444
     movdqa    xmm2, xmm0
-    pand      xmm0, xmm4    // mask low nibbles
-    pand      xmm2, xmm5    // mask high nibbles
+    pand      xmm0, xmm4  // mask low nibbles
+    pand      xmm2, xmm5  // mask high nibbles
     movdqa    xmm1, xmm0
     movdqa    xmm3, xmm2
     psllw     xmm1, 4
@@ -748,37 +718,38 @@
   }
 }
 
-__declspec(naked)
-void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb,
+                                            uint8_t* dst_rgb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRGB24
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
     movdqu    xmm1, [eax + 16]
     movdqu    xmm2, [eax + 32]
     movdqu    xmm3, [eax + 48]
     lea       eax, [eax + 64]
-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
     pshufb    xmm1, xmm6
     pshufb    xmm2, xmm6
     pshufb    xmm3, xmm6
-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
-    psrldq    xmm1, 4      // 8 bytes from 1
-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
-    por       xmm0, xmm4   // 4 bytes from 1 for 0
-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
+    psrldq    xmm1, 4  // 8 bytes from 1
+    pslldq    xmm4, 12  // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
+    por       xmm0, xmm4  // 4 bytes from 1 for 0
+    pslldq    xmm5, 8  // 8 bytes from 2 for 1
     movdqu    [edx], xmm0  // store 0
-    por       xmm1, xmm5   // 8 bytes from 2 for 1
-    psrldq    xmm2, 8      // 4 bytes from 2
-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
-    por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqu    [edx + 16], xmm1   // store 1
-    movdqu    [edx + 32], xmm2   // store 2
+    por       xmm1, xmm5  // 8 bytes from 2 for 1
+    psrldq    xmm2, 8  // 4 bytes from 2
+    pslldq    xmm3, 4  // 12 bytes from 3 for 2
+    por       xmm2, xmm3  // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1  // store 1
+    movdqu    [edx + 32], xmm2  // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
@@ -786,37 +757,38 @@
   }
 }
 
-__declspec(naked)
-void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb,
+                                          uint8_t* dst_rgb,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
     movdqa    xmm6, xmmword ptr kShuffleMaskARGBToRAW
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 16 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 16 pixels of argb
     movdqu    xmm1, [eax + 16]
     movdqu    xmm2, [eax + 32]
     movdqu    xmm3, [eax + 48]
     lea       eax, [eax + 64]
-    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
+    pshufb    xmm0, xmm6  // pack 16 bytes of ARGB to 12 bytes of RGB
     pshufb    xmm1, xmm6
     pshufb    xmm2, xmm6
     pshufb    xmm3, xmm6
-    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
-    psrldq    xmm1, 4      // 8 bytes from 1
-    pslldq    xmm4, 12     // 4 bytes from 1 for 0
-    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
-    por       xmm0, xmm4   // 4 bytes from 1 for 0
-    pslldq    xmm5, 8      // 8 bytes from 2 for 1
+    movdqa    xmm4, xmm1  // 4 bytes from 1 for 0
+    psrldq    xmm1, 4  // 8 bytes from 1
+    pslldq    xmm4, 12  // 4 bytes from 1 for 0
+    movdqa    xmm5, xmm2  // 8 bytes from 2 for 1
+    por       xmm0, xmm4  // 4 bytes from 1 for 0
+    pslldq    xmm5, 8  // 8 bytes from 2 for 1
     movdqu    [edx], xmm0  // store 0
-    por       xmm1, xmm5   // 8 bytes from 2 for 1
-    psrldq    xmm2, 8      // 4 bytes from 2
-    pslldq    xmm3, 4      // 12 bytes from 3 for 2
-    por       xmm2, xmm3   // 12 bytes from 3 for 2
-    movdqu    [edx + 16], xmm1   // store 1
-    movdqu    [edx + 32], xmm2   // store 2
+    por       xmm1, xmm5  // 8 bytes from 2 for 1
+    psrldq    xmm2, 8  // 4 bytes from 2
+    pslldq    xmm3, 4  // 12 bytes from 3 for 2
+    por       xmm2, xmm3  // 12 bytes from 3 for 2
+    movdqu    [edx + 16], xmm1  // store 1
+    movdqu    [edx + 32], xmm2  // store 2
     lea       edx, [edx + 48]
     sub       ecx, 16
     jg        convertloop
@@ -824,33 +796,34 @@
   }
 }
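
Both ARGBToRGB24Row_SSSE3 and ARGBToRAWRow_SSSE3 above use the same trick:
pshufb compacts each 16-byte ARGB register down to 12 RGB bytes, and the
pslldq/psrldq/por pairs stitch four 12-byte fragments into three full
16-byte stores. A minimal scalar sketch of the same 4:3 repacking,
assuming libyuv's little-endian BGRA byte layout for ARGB (the helper
name is ours, not libyuv's; uses <stdint.h> types, as do the later
sketches):

    #include <stdint.h>

    static void ARGBToRAW_Sketch(const uint8_t* src_argb, uint8_t* dst_raw,
                                 int width) {
      int x;
      for (x = 0; x < width; ++x) {
        dst_raw[0] = src_argb[2];  /* R (RAW is R,G,B byte order) */
        dst_raw[1] = src_argb[1];  /* G */
        dst_raw[2] = src_argb[0];  /* B; the alpha byte is dropped */
        src_argb += 4;
        dst_raw += 3;
      }
    }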
 
-__declspec(naked)
-void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb,
+                                            uint8_t* dst_rgb,
+                                            int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
     psrld     xmm3, 27
-    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
     psrld     xmm4, 26
     pslld     xmm4, 5
-    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
     pslld     xmm5, 11
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    pslld     xmm0, 8       // R
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 5       // G
-    psrad     xmm0, 16      // R
-    pand      xmm1, xmm3    // B
-    pand      xmm2, xmm4    // G
-    pand      xmm0, xmm5    // R
-    por       xmm1, xmm2    // BG
-    por       xmm0, xmm1    // BGR
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    pslld     xmm0, 8  // R
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 5  // G
+    psrad     xmm0, 16  // R
+    pand      xmm1, xmm3  // B
+    pand      xmm2, xmm4  // G
+    pand      xmm0, xmm5  // R
+    por       xmm1, xmm2  // BG
+    por       xmm0, xmm1  // BGR
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
@@ -861,41 +834,42 @@
   }
 }
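
The mask constants above (0x0000001f, 0x000007e0, 0xfffff800) select the
three RGB565 fields after the per-channel shifts. The scalar equivalent,
as a sketch:

    /* RGB565 keeps the top 5 bits of B and R and the top 6 bits of G. */
    static uint16_t PackRGB565(uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
    }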
 
-__declspec(naked)
-void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
+__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb,
+                                                  uint8_t* dst_rgb,
+                                                  const uint32_t dither4,
+                                                  int width) {
   __asm {
 
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
-    movd      xmm6, [esp + 12] // dither4
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
+    movd      xmm6, [esp + 12]  // dither4
     mov       ecx, [esp + 16]  // width
-    punpcklbw xmm6, xmm6       // make dither 16 bytes
+    punpcklbw xmm6, xmm6  // make dither 16 bytes
     movdqa    xmm7, xmm6
     punpcklwd xmm6, xmm6
     punpckhwd xmm7, xmm7
-    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
+    pcmpeqb   xmm3, xmm3  // generate mask 0x0000001f
     psrld     xmm3, 27
-    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
+    pcmpeqb   xmm4, xmm4  // generate mask 0x000007e0
     psrld     xmm4, 26
     pslld     xmm4, 5
-    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
+    pcmpeqb   xmm5, xmm5  // generate mask 0xfffff800
     pslld     xmm5, 11
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    paddusb   xmm0, xmm6    // add dither
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    pslld     xmm0, 8       // R
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 5       // G
-    psrad     xmm0, 16      // R
-    pand      xmm1, xmm3    // B
-    pand      xmm2, xmm4    // G
-    pand      xmm0, xmm5    // R
-    por       xmm1, xmm2    // BG
-    por       xmm0, xmm1    // BGR
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    paddusb   xmm0, xmm6  // add dither
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    pslld     xmm0, 8  // R
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 5  // G
+    psrad     xmm0, 16  // R
+    pand      xmm1, xmm3  // B
+    pand      xmm2, xmm4  // G
+    pand      xmm0, xmm5  // R
+    por       xmm1, xmm2  // BG
+    por       xmm0, xmm1  // BGR
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of RGB565
@@ -907,39 +881,40 @@
 }
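
The dither variant differs from ARGBToRGB565Row_SSE2 only in the paddusb
before packing: the caller's four dither bytes are replicated across the
register and added with unsigned saturation, so the truncation to 5/6/5
bits rounds differently per column. A scalar sketch (helper name ours):

    static uint16_t PackRGB565Dither(uint8_t r, uint8_t g, uint8_t b,
                                     uint8_t dither) {
      /* paddusb: add with unsigned saturation before truncating. */
      int bd = b + dither > 255 ? 255 : b + dither;
      int gd = g + dither > 255 ? 255 : g + dither;
      int rd = r + dither > 255 ? 255 : r + dither;
      return (uint16_t)((bd >> 3) | ((gd >> 2) << 5) | ((rd >> 3) << 11));
    }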
 
 #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
-__declspec(naked)
-void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb,
-                                const uint32 dither4, int width) {
+__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb,
+                                                  uint8_t* dst_rgb,
+                                                  const uint32_t dither4,
+                                                  int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
     vbroadcastss xmm6, [esp + 12]  // dither4
-    mov        ecx, [esp + 16]     // width
-    vpunpcklbw xmm6, xmm6, xmm6    // make dither 32 bytes
+    mov        ecx, [esp + 16]  // width
+    vpunpcklbw xmm6, xmm6, xmm6  // make dither 32 bytes
     vpermq     ymm6, ymm6, 0xd8
     vpunpcklwd ymm6, ymm6, ymm6
-    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
     vpsrld     ymm3, ymm3, 27
-    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
     vpsrld     ymm4, ymm4, 26
     vpslld     ymm4, ymm4, 5
-    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpaddusb   ymm0, ymm0, ymm6    // add dither
-    vpsrld     ymm2, ymm0, 5       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrld     ymm0, ymm0, 8       // R
-    vpand      ymm2, ymm2, ymm4    // G
-    vpand      ymm1, ymm1, ymm3    // B
-    vpand      ymm0, ymm0, ymm5    // R
-    vpor       ymm1, ymm1, ymm2    // BG
-    vpor       ymm0, ymm0, ymm1    // BGR
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpaddusb   ymm0, ymm0, ymm6  // add dither
+    vpsrld     ymm2, ymm0, 5  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrld     ymm0, ymm0, 8  // R
+    vpand      ymm2, ymm2, ymm4  // G
+    vpand      ymm1, ymm1, ymm3  // B
+    vpand      ymm0, ymm0, ymm5  // R
+    vpor       ymm1, ymm1, ymm2  // BG
+    vpor       ymm0, ymm0, ymm1  // BGR
     vpackusdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -950,37 +925,38 @@
 #endif  // HAS_ARGBTORGB565DITHERROW_AVX2
 
 // TODO(fbarchard): Improve sign extension/packing.
-__declspec(naked)
-void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
+    pcmpeqb   xmm4, xmm4  // generate mask 0x0000001f
     psrld     xmm4, 27
-    movdqa    xmm5, xmm4       // generate mask 0x000003e0
+    movdqa    xmm5, xmm4  // generate mask 0x000003e0
     pslld     xmm5, 5
-    movdqa    xmm6, xmm4       // generate mask 0x00007c00
+    movdqa    xmm6, xmm4  // generate mask 0x00007c00
     pslld     xmm6, 10
-    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
+    pcmpeqb   xmm7, xmm7  // generate mask 0xffff8000
     pslld     xmm7, 15
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
-    movdqa    xmm1, xmm0    // B
-    movdqa    xmm2, xmm0    // G
-    movdqa    xmm3, xmm0    // R
-    psrad     xmm0, 16      // A
-    psrld     xmm1, 3       // B
-    psrld     xmm2, 6       // G
-    psrld     xmm3, 9       // R
-    pand      xmm0, xmm7    // A
-    pand      xmm1, xmm4    // B
-    pand      xmm2, xmm5    // G
-    pand      xmm3, xmm6    // R
-    por       xmm0, xmm1    // BA
-    por       xmm2, xmm3    // GR
-    por       xmm0, xmm2    // BGRA
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
+    movdqa    xmm1, xmm0  // B
+    movdqa    xmm2, xmm0  // G
+    movdqa    xmm3, xmm0  // R
+    psrad     xmm0, 16  // A
+    psrld     xmm1, 3  // B
+    psrld     xmm2, 6  // G
+    psrld     xmm3, 9  // R
+    pand      xmm0, xmm7  // A
+    pand      xmm1, xmm4  // B
+    pand      xmm2, xmm5  // G
+    pand      xmm3, xmm6  // R
+    por       xmm0, xmm1  // BA
+    por       xmm2, xmm3  // GR
+    por       xmm0, xmm2  // BGRA
     packssdw  xmm0, xmm0
     lea       eax, [eax + 16]
     movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
@@ -991,22 +967,23 @@
   }
 }
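
Here the four masks pick one alpha bit and 5 bits per color channel; the
psrad 16 arithmetic shift replicates alpha's top bit so the 0xffff8000
mask leaves a>>7 in bit 15. Scalar sketch:

    static uint16_t PackARGB1555(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)((b >> 3) | ((g >> 3) << 5) | ((r >> 3) << 10) |
                        ((a >> 7) << 15));
    }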
 
-__declspec(naked)
-void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov       eax, [esp + 4]   // src_argb
-    mov       edx, [esp + 8]   // dst_rgb
+    mov       eax, [esp + 4]  // src_argb
+    mov       edx, [esp + 8]  // dst_rgb
     mov       ecx, [esp + 12]  // width
-    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
+    pcmpeqb   xmm4, xmm4  // generate mask 0xf000f000
     psllw     xmm4, 12
-    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
+    movdqa    xmm3, xmm4  // generate mask 0x00f000f0
     psrlw     xmm3, 8
 
  convertloop:
-    movdqu    xmm0, [eax]   // fetch 4 pixels of argb
+    movdqu    xmm0, [eax]  // fetch 4 pixels of argb
     movdqa    xmm1, xmm0
-    pand      xmm0, xmm3    // low nibble
-    pand      xmm1, xmm4    // high nibble
+    pand      xmm0, xmm3  // low nibble
+    pand      xmm1, xmm4  // high nibble
     psrld     xmm0, 4
     psrld     xmm1, 8
     por       xmm0, xmm1
@@ -1021,33 +998,34 @@
 }
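
ARGB4444 keeps only the high nibble of each channel; the two masks
(0x00f000f0 and 0xf000f000) split the nibbles so two shifts and a por
line them up. Scalar sketch:

    static uint16_t PackARGB4444(uint8_t a, uint8_t r, uint8_t g, uint8_t b) {
      return (uint16_t)((b >> 4) | (g & 0xf0) |           /* low byte  */
                        (((r >> 4) | (a & 0xf0)) << 8));  /* high byte */
    }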
 
 #ifdef HAS_ARGBTORGB565ROW_AVX2
-__declspec(naked)
-void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb,
+                                            uint8_t* dst_rgb,
+                                            int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
-    mov        ecx, [esp + 12]     // width
-    vpcmpeqb   ymm3, ymm3, ymm3    // generate mask 0x0000001f
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
+    mov        ecx, [esp + 12]  // width
+    vpcmpeqb   ymm3, ymm3, ymm3  // generate mask 0x0000001f
     vpsrld     ymm3, ymm3, 27
-    vpcmpeqb   ymm4, ymm4, ymm4    // generate mask 0x000007e0
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0x000007e0
     vpsrld     ymm4, ymm4, 26
     vpslld     ymm4, ymm4, 5
-    vpslld     ymm5, ymm3, 11      // generate mask 0x0000f800
+    vpslld     ymm5, ymm3, 11  // generate mask 0x0000f800
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpsrld     ymm2, ymm0, 5       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrld     ymm0, ymm0, 8       // R
-    vpand      ymm2, ymm2, ymm4    // G
-    vpand      ymm1, ymm1, ymm3    // B
-    vpand      ymm0, ymm0, ymm5    // R
-    vpor       ymm1, ymm1, ymm2    // BG
-    vpor       ymm0, ymm0, ymm1    // BGR
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpsrld     ymm2, ymm0, 5  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrld     ymm0, ymm0, 8  // R
+    vpand      ymm2, ymm2, ymm4  // G
+    vpand      ymm1, ymm1, ymm3  // B
+    vpand      ymm0, ymm0, ymm5  // R
+    vpor       ymm1, ymm1, ymm2  // BG
+    vpor       ymm0, ymm0, ymm1  // BGR
     vpackusdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of RGB565
+    vmovdqu    [edx], xmm0  // store 8 pixels of RGB565
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1058,36 +1036,37 @@
 #endif  // HAS_ARGBTORGB565ROW_AVX2
 
 #ifdef HAS_ARGBTOARGB1555ROW_AVX2
-__declspec(naked)
-void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]      // src_argb
-    mov        edx, [esp + 8]      // dst_rgb
-    mov        ecx, [esp + 12]     // width
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm4, ymm4, ymm4
-    vpsrld     ymm4, ymm4, 27      // generate mask 0x0000001f
-    vpslld     ymm5, ymm4, 5       // generate mask 0x000003e0
-    vpslld     ymm6, ymm4, 10      // generate mask 0x00007c00
-    vpcmpeqb   ymm7, ymm7, ymm7    // generate mask 0xffff8000
+    vpsrld     ymm4, ymm4, 27  // generate mask 0x0000001f
+    vpslld     ymm5, ymm4, 5  // generate mask 0x000003e0
+    vpslld     ymm6, ymm4, 10  // generate mask 0x00007c00
+    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xffff8000
     vpslld     ymm7, ymm7, 15
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpsrld     ymm3, ymm0, 9       // R
-    vpsrld     ymm2, ymm0, 6       // G
-    vpsrld     ymm1, ymm0, 3       // B
-    vpsrad     ymm0, ymm0, 16      // A
-    vpand      ymm3, ymm3, ymm6    // R
-    vpand      ymm2, ymm2, ymm5    // G
-    vpand      ymm1, ymm1, ymm4    // B
-    vpand      ymm0, ymm0, ymm7    // A
-    vpor       ymm0, ymm0, ymm1    // BA
-    vpor       ymm2, ymm2, ymm3    // GR
-    vpor       ymm0, ymm0, ymm2    // BGRA
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpsrld     ymm3, ymm0, 9  // R
+    vpsrld     ymm2, ymm0, 6  // G
+    vpsrld     ymm1, ymm0, 3  // B
+    vpsrad     ymm0, ymm0, 16  // A
+    vpand      ymm3, ymm3, ymm6  // R
+    vpand      ymm2, ymm2, ymm5  // G
+    vpand      ymm1, ymm1, ymm4  // B
+    vpand      ymm0, ymm0, ymm7  // A
+    vpor       ymm0, ymm0, ymm1  // BA
+    vpor       ymm2, ymm2, ymm3  // GR
+    vpor       ymm0, ymm0, ymm2  // BGRA
     vpackssdw  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB1555
+    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB1555
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1098,20 +1077,21 @@
 #endif  // HAS_ARGBTOARGB1555ROW_AVX2
 
 #ifdef HAS_ARGBTOARGB4444ROW_AVX2
-__declspec(naked)
-void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) {
+__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb,
+                                              uint8_t* dst_rgb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_rgb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_rgb
     mov        ecx, [esp + 12]  // width
-    vpcmpeqb   ymm4, ymm4, ymm4   // generate mask 0xf000f000
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xf000f000
     vpsllw     ymm4, ymm4, 12
-    vpsrlw     ymm3, ymm4, 8      // generate mask 0x00f000f0
+    vpsrlw     ymm3, ymm4, 8  // generate mask 0x00f000f0
 
  convertloop:
-    vmovdqu    ymm0, [eax]         // fetch 8 pixels of argb
-    vpand      ymm1, ymm0, ymm4    // high nibble
-    vpand      ymm0, ymm0, ymm3    // low nibble
+    vmovdqu    ymm0, [eax]  // fetch 8 pixels of argb
+    vpand      ymm1, ymm0, ymm4  // high nibble
+    vpand      ymm0, ymm0, ymm3  // low nibble
     vpsrld     ymm1, ymm1, 8
     vpsrld     ymm0, ymm0, 4
     vpor       ymm0, ymm0, ymm1
@@ -1118,7 +1098,7 @@
     vpackuswb  ymm0, ymm0, ymm0
     vpermq     ymm0, ymm0, 0xd8
     lea        eax, [eax + 32]
-    vmovdqu    [edx], xmm0         // store 8 pixels of ARGB4444
+    vmovdqu    [edx], xmm0  // store 8 pixels of ARGB4444
     lea        edx, [edx + 16]
     sub        ecx, 8
     jg         convertloop
@@ -1129,12 +1109,13 @@
 #endif  // HAS_ARGBTOARGB4444ROW_AVX2
 
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
-__declspec(naked)
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kARGBToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1164,12 +1145,13 @@
 
 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
 // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
-__declspec(naked)
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb,
+                                         uint8_t* dst_y,
+                                         int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kARGBToYJ
     movdqa     xmm5, xmmword ptr kAddYJ64
 
@@ -1200,17 +1182,16 @@
 
 #ifdef HAS_ARGBTOYROW_AVX2
 // vpermd for vphaddw + vpackuswb vpermd.
-static const lvec32 kPermdARGBToY_AVX = {
-  0, 4, 1, 5, 2, 6, 3, 7
-};
+static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
 
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     vbroadcastf128 ymm4, xmmword ptr kARGBToY
     vbroadcastf128 ymm5, xmmword ptr kAddY16
     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
@@ -1244,12 +1225,13 @@
 
 #ifdef HAS_ARGBTOYJROW_AVX2
 // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
-__declspec(naked)
-void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     vbroadcastf128 ymm4, xmmword ptr kARGBToYJ
     vbroadcastf128 ymm5, xmmword ptr kAddYJ64
     vmovdqu    ymm6, ymmword ptr kPermdARGBToY_AVX
@@ -1283,12 +1265,13 @@
 }
 #endif  //  HAS_ARGBTOYJROW_AVX2
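
All of the Y rows above follow one pattern: pmaddubsw (or vpmaddubsw)
against a per-format coefficient table (kARGBToY, kARGBToYJ, kBGRAToY,
...), a rounding/offset constant (kAddY16 or kAddYJ64), then a shift and
pack. As a scalar sketch, the BT.601 formulas we believe those tables
encode (treat the constants as illustrative; the authoritative values
live in the tables themselves):

    /* Studio-range Y: +16 offset and 0x80 rounding folded into 0x1080. */
    static uint8_t RGBToY(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((66 * r + 129 * g + 25 * b + 0x1080) >> 8);
    }

    /* Full-range "YJ" (JPEG) variant: no +16 offset, rounds via +64. */
    static uint8_t RGBToYJ(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((38 * r + 75 * g + 15 * b + 64) >> 7);
    }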
 
-__declspec(naked)
-void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kBGRAToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1316,12 +1299,13 @@
   }
 }
 
-__declspec(naked)
-void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kABGRToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1349,12 +1333,13 @@
   }
 }
 
-__declspec(naked)
-void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
+__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb,
+                                        uint8_t* dst_y,
+                                        int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_y */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_y */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kRGBAToY
     movdqa     xmm5, xmmword ptr kAddY16
 
@@ -1382,14 +1367,16 @@
   }
 }
 
-__declspec(naked)
-void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
@@ -1396,10 +1383,10 @@
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kARGBToV
     movdqa     xmm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1423,9 +1410,9 @@
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, it's 8 pixels of U and 8 of V

     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1437,11 +1424,11 @@
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1452,14 +1439,16 @@
   }
 }
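
The UV rows first average 2x2 blocks of pixels (pavgb plus the shufps
pairs), then run the same pmaddubsw pattern with U and V coefficient
tables and re-bias by kAddUV128 so the signed result lands in unsigned
range. A scalar sketch of what we understand the BT.601 tables to
compute per averaged pixel (illustrative, not the authoritative
constants):

    static uint8_t RGBToU(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    }
    static uint8_t RGBToV(uint8_t r, uint8_t g, uint8_t b) {
      return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    }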
 
-__declspec(naked)
-void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                        uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
+                                          int src_stride_argb,
+                                          uint8_t* dst_u,
+                                          uint8_t* dst_v,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
@@ -1466,10 +1455,10 @@
     movdqa     xmm5, xmmword ptr kAddUVJ128
     movdqa     xmm6, xmmword ptr kARGBToVJ
     movdqa     xmm7, xmmword ptr kARGBToUJ
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1493,9 +1482,9 @@
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, it's 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1510,9 +1499,9 @@
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1524,14 +1513,16 @@
 }
 
 #ifdef HAS_ARGBTOUVROW_AVX2
-__declspec(naked)
-void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0,
+                                        int src_stride_argb,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
@@ -1538,10 +1529,10 @@
     vbroadcastf128 ymm5, xmmword ptr kAddUV128
     vbroadcastf128 ymm6, xmmword ptr kARGBToV
     vbroadcastf128 ymm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx   // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+        /* step 1 - subsample 32x2 argb pixels to 16x1 */
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     vmovdqu    ymm2, [eax + 64]
@@ -1558,9 +1549,9 @@
     vshufps    ymm2, ymm2, ymm3, 0xdd
     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 32 different pixels, it's 16 pixels of U and 16 of V
     vpmaddubsw ymm1, ymm0, ymm7  // U
     vpmaddubsw ymm3, ymm2, ymm7
     vpmaddubsw ymm0, ymm0, ymm6  // V
@@ -1574,9 +1565,9 @@
     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
     vpaddb     ymm0, ymm0, ymm5  // -> unsigned
 
-    // step 3 - store 16 U and 16 V values
-    vextractf128 [edx], ymm0, 0 // U
-    vextractf128 [edx + edi], ymm0, 1 // V
+        // step 3 - store 16 U and 16 V values
+    vextractf128 [edx], ymm0, 0  // U
+    vextractf128 [edx + edi], ymm0, 1  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -1590,14 +1581,16 @@
 #endif  // HAS_ARGBTOUVROW_AVX2
 
 #ifdef HAS_ARGBTOUVJROW_AVX2
-__declspec(naked)
-void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
@@ -1604,10 +1597,10 @@
     vbroadcastf128 ymm5, xmmword ptr kAddUV128
     vbroadcastf128 ymm6, xmmword ptr kARGBToV
     vbroadcastf128 ymm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx   // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 32x2 argb pixels to 16x1 */
+        /* step 1 - subsample 32x2 argb pixels to 16x1 */
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     vmovdqu    ymm2, [eax + 64]
@@ -1624,9 +1617,9 @@
     vshufps    ymm2, ymm2, ymm3, 0xdd
     vpavgb     ymm2, ymm2, ymm4  // mutated by vshufps
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 32 different pixels, it's 16 pixels of U and 16 of V
     vpmaddubsw ymm1, ymm0, ymm7  // U
     vpmaddubsw ymm3, ymm2, ymm7
     vpmaddubsw ymm0, ymm0, ymm6  // V
@@ -1641,9 +1634,9 @@
     vpermq     ymm0, ymm0, 0xd8  // For vpacksswb
     vpshufb    ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX  // for vshufps/vphaddw
 
-    // step 3 - store 16 U and 16 V values
-    vextractf128 [edx], ymm0, 0 // U
-    vextractf128 [edx + edi], ymm0, 1 // V
+        // step 3 - store 16 U and 16 V values
+    vextractf128 [edx], ymm0, 0  // U
+    vextractf128 [edx + edi], ymm0, 1  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -1656,23 +1649,24 @@
 }
 #endif  // HAS_ARGBTOUVJROW_AVX2
 
-__declspec(naked)
-void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
-                          uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0,
+                                            uint8_t* dst_u,
+                                            uint8_t* dst_v,
+                                            int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]   // src_argb
-    mov        edx, [esp + 4 + 8]   // dst_u
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        edx, [esp + 4 + 8]  // dst_u
     mov        edi, [esp + 4 + 12]  // dst_v
     mov        ecx, [esp + 4 + 16]  // width
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kARGBToV
     movdqa     xmm7, xmmword ptr kARGBToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx    // stride from u to v
 
  convertloop:
-    /* convert to U and V */
-    movdqu     xmm0, [eax]          // U
+        /* convert to U and V */
+    movdqu     xmm0, [eax]  // U
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
     movdqu     xmm3, [eax + 48]
@@ -1688,7 +1682,7 @@
     paddb      xmm0, xmm5
     movdqu     [edx], xmm0
 
-    movdqu     xmm0, [eax]          // V
+    movdqu     xmm0, [eax]  // V
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
     movdqu     xmm3, [eax + 48]
@@ -1713,14 +1707,16 @@
   }
 }
 
-__declspec(naked)
-void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
@@ -1727,10 +1723,10 @@
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kBGRAToV
     movdqa     xmm7, xmmword ptr kBGRAToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1754,9 +1750,9 @@
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, it's 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1768,11 +1764,11 @@
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1783,14 +1779,16 @@
   }
 }
 
-__declspec(naked)
-void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
@@ -1797,10 +1795,10 @@
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kABGRToV
     movdqa     xmm7, xmmword ptr kABGRToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1824,9 +1822,9 @@
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, it's 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1838,11 +1836,11 @@
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1853,14 +1851,16 @@
   }
 }
 
-__declspec(naked)
-void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
-                       uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0,
+                                         int src_stride_argb,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_argb
-    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        eax, [esp + 8 + 4]  // src_argb
+    mov        esi, [esp + 8 + 8]  // src_stride_argb
     mov        edx, [esp + 8 + 12]  // dst_u
     mov        edi, [esp + 8 + 16]  // dst_v
     mov        ecx, [esp + 8 + 20]  // width
@@ -1867,10 +1867,10 @@
     movdqa     xmm5, xmmword ptr kAddUV128
     movdqa     xmm6, xmmword ptr kRGBAToV
     movdqa     xmm7, xmmword ptr kRGBAToU
-    sub        edi, edx             // stride from u to v
+    sub        edi, edx  // stride from u to v
 
  convertloop:
-    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+         /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
     movdqu     xmm4, [eax + esi]
     pavgb      xmm0, xmm4
@@ -1894,9 +1894,9 @@
     shufps     xmm4, xmm3, 0xdd
     pavgb      xmm2, xmm4
 
-    // step 2 - convert to U and V
-    // from here down is very similar to Y code except
-    // instead of 16 different pixels, its 8 pixels of U and 8 of V
+        // step 2 - convert to U and V
+        // from here down is very similar to Y code except
+        // instead of 16 different pixels, it's 8 pixels of U and 8 of V
     movdqa     xmm1, xmm0
     movdqa     xmm3, xmm2
     pmaddubsw  xmm0, xmm7  // U
@@ -1908,11 +1908,11 @@
     psraw      xmm0, 8
     psraw      xmm1, 8
     packsswb   xmm0, xmm1
-    paddb      xmm0, xmm5            // -> unsigned
+    paddb      xmm0, xmm5  // -> unsigned
 
-    // step 3 - store 8 U and 8 V values
-    movlps     qword ptr [edx], xmm0 // U
-    movhps     qword ptr [edx + edi], xmm0 // V
+        // step 3 - store 8 U and 8 V values
+    movlps     qword ptr [edx], xmm0  // U
+    movhps     qword ptr [edx + edi], xmm0  // V
     lea        edx, [edx + 8]
     sub        ecx, 16
     jg         convertloop
@@ -1925,109 +1925,95 @@
 #endif  // HAS_ARGBTOYROW_SSSE3
 
 // Read 16 UV from 444
-#define READYUV444_AVX2 __asm {                                                \
-    __asm vmovdqu    xmm0, [esi]                  /* U */                      \
-    __asm vmovdqu    xmm1, [esi + edi]            /* V */                      \
+#define READYUV444_AVX2 \
+  __asm {                                                \
+    __asm vmovdqu    xmm0, [esi] /* U */                      \
+    __asm vmovdqu    xmm1, [esi + edi] /* V */                      \
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
     __asm vpermq     ymm1, ymm1, 0xd8                                          \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 8 UV from 422, upsample to 16 UV.
-#define READYUV422_AVX2 __asm {                                                \
-    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
-    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
+#define READYUV422_AVX2 \
+  __asm {                                                \
+    __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
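
Note the upsample step in READYUV422_AVX2: vpunpcklbw interleaves the
8 U and 8 V bytes, and vpunpcklwd then doubles each UV pair so that
8 chroma samples cover 16 luma samples. In scalar terms (sketch only):

    /* 4:2:2 -> 4:4:4: replicate each chroma sample across two pixels. */
    static void UpsampleUV422(const uint8_t* u422, uint8_t* u444, int width) {
      int x;
      for (x = 0; x + 1 < width; x += 2) {
        u444[x] = u444[x + 1] = u422[x / 2];
      }
    }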
 
 // Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
-#define READYUVA422_AVX2 __asm {                                               \
-    __asm vmovq      xmm0, qword ptr [esi]        /* U */                      \
-    __asm vmovq      xmm1, qword ptr [esi + edi]  /* V */                      \
+#define READYUVA422_AVX2 \
+  __asm {                                               \
+    __asm vmovq      xmm0, qword ptr [esi] /* U */                      \
+    __asm vmovq      xmm1, qword ptr [esi + edi] /* V */                      \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */                     \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
     __asm lea        eax, [eax + 16]                                           \
-    __asm vmovdqu    xmm5, [ebp]                  /* A */                      \
+    __asm vmovdqu    xmm5, [ebp] /* A */                      \
     __asm vpermq     ymm5, ymm5, 0xd8                                          \
-    __asm lea        ebp, [ebp + 16]                                           \
-  }
+    __asm lea        ebp, [ebp + 16]}
 
-// Read 4 UV from 411, upsample to 16 UV.
-#define READYUV411_AVX2 __asm {                                                \
-    __asm vmovd      xmm0, dword ptr [esi]        /* U */                      \
-    __asm vmovd      xmm1, dword ptr [esi + edi]  /* V */                      \
-    __asm lea        esi,  [esi + 4]                                           \
-    __asm vpunpcklbw ymm0, ymm0, ymm1             /* UV */                     \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpckldq ymm0, ymm0, ymm0             /* UVUVUVUV (upsample) */    \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
-    __asm vpermq     ymm4, ymm4, 0xd8                                          \
-    __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
-
 // Read 8 UV from NV12, upsample to 16 UV.
-#define READNV12_AVX2 __asm {                                                  \
-    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+#define READNV12_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi] /* UV */                     \
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm0, ymm0             /* UVUV (upsample) */        \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */        \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 8 UV from NV21, upsample to 16 UV.
-#define READNV21_AVX2 __asm {                                                  \
-    __asm vmovdqu    xmm0, [esi]                  /* UV */                     \
+#define READNV21_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    xmm0, [esi] /* UV */                     \
     __asm lea        esi,  [esi + 16]                                          \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleNV21                      \
-    __asm vmovdqu    xmm4, [eax]                  /* Y */                      \
+    __asm vmovdqu    xmm4, [eax] /* Y */                      \
     __asm vpermq     ymm4, ymm4, 0xd8                                          \
     __asm vpunpcklbw ymm4, ymm4, ymm4                                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
-#define READYUY2_AVX2 __asm {                                                  \
-    __asm vmovdqu    ymm4, [eax]          /* YUY2 */                           \
+#define READYUY2_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax] /* YUY2 */                           \
     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleYUY2Y                     \
-    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
+    __asm vmovdqu    ymm0, [eax] /* UV */                             \
     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleYUY2UV                    \
-    __asm lea        eax, [eax + 32]                                           \
-  }
+    __asm lea        eax, [eax + 32]}
 
 // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
-#define READUYVY_AVX2 __asm {                                                  \
-    __asm vmovdqu    ymm4, [eax]          /* UYVY */                           \
+#define READUYVY_AVX2 \
+  __asm {                                                  \
+    __asm vmovdqu    ymm4, [eax] /* UYVY */                           \
     __asm vpshufb    ymm4, ymm4, ymmword ptr kShuffleUYVYY                     \
-    __asm vmovdqu    ymm0, [eax]          /* UV */                             \
+    __asm vmovdqu    ymm0, [eax] /* UV */                             \
     __asm vpshufb    ymm0, ymm0, ymmword ptr kShuffleUYVYUV                    \
-    __asm lea        eax, [eax + 32]                                           \
-  }
+    __asm lea        eax, [eax + 32]}
 
 // Convert 16 pixels: 16 UV and 16 Y.
-#define YUVTORGB_AVX2(YuvConstants) __asm {                                    \
+#define YUVTORGB_AVX2(YuvConstants) \
+  __asm {                                    \
     __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
     __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
     __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
@@ -2036,68 +2022,67 @@
     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASG]               \
     __asm vpsubw     ymm1, ymm3, ymm1                                          \
     __asm vmovdqu    ymm3, ymmword ptr [YuvConstants + KUVBIASB]               \
-    __asm vpsubw     ymm0, ymm3, ymm0                                          \
-    /* Step 2: Find Y contribution to 16 R,G,B values */                       \
+    __asm vpsubw     ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */                       \
     __asm vpmulhuw   ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB]          \
-    __asm vpaddsw    ymm0, ymm0, ymm4           /* B += Y */                   \
-    __asm vpaddsw    ymm1, ymm1, ymm4           /* G += Y */                   \
-    __asm vpaddsw    ymm2, ymm2, ymm4           /* R += Y */                   \
+    __asm vpaddsw    ymm0, ymm0, ymm4 /* B += Y */                   \
+    __asm vpaddsw    ymm1, ymm1, ymm4 /* G += Y */                   \
+    __asm vpaddsw    ymm2, ymm2, ymm4 /* R += Y */                   \
     __asm vpsraw     ymm0, ymm0, 6                                             \
     __asm vpsraw     ymm1, ymm1, 6                                             \
     __asm vpsraw     ymm2, ymm2, 6                                             \
-    __asm vpackuswb  ymm0, ymm0, ymm0           /* B */                        \
-    __asm vpackuswb  ymm1, ymm1, ymm1           /* G */                        \
-    __asm vpackuswb  ymm2, ymm2, ymm2           /* R */                        \
+    __asm vpackuswb  ymm0, ymm0, ymm0 /* B */                        \
+    __asm vpackuswb  ymm1, ymm1, ymm1 /* G */                        \
+    __asm vpackuswb  ymm2, ymm2, ymm2 /* R */                  \
   }
 
 // Store 16 ARGB values.
-#define STOREARGB_AVX2 __asm {                                                 \
-    __asm vpunpcklbw ymm0, ymm0, ymm1           /* BG */                       \
+#define STOREARGB_AVX2 \
+  __asm {                                                 \
+    __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */                       \
     __asm vpermq     ymm0, ymm0, 0xd8                                          \
-    __asm vpunpcklbw ymm2, ymm2, ymm5           /* RA */                       \
+    __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */                       \
     __asm vpermq     ymm2, ymm2, 0xd8                                          \
-    __asm vpunpcklwd ymm1, ymm0, ymm2           /* BGRA first 8 pixels */      \
-    __asm vpunpckhwd ymm0, ymm0, ymm2           /* BGRA next 8 pixels */       \
+    __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */      \
+    __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */       \
     __asm vmovdqu    0[edx], ymm1                                              \
     __asm vmovdqu    32[edx], ymm0                                             \
-    __asm lea        edx,  [edx + 64]                                          \
-  }
+    __asm lea        edx,  [edx + 64]}
 
 // Store 16 RGBA values.
-#define STORERGBA_AVX2 __asm {                                                 \
-    __asm vpunpcklbw ymm1, ymm1, ymm2           /* GR */                       \
+#define STORERGBA_AVX2 \
+  __asm {                                                 \
+    __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */                       \
     __asm vpermq     ymm1, ymm1, 0xd8                                          \
-    __asm vpunpcklbw ymm2, ymm5, ymm0           /* AB */                       \
+    __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */                       \
     __asm vpermq     ymm2, ymm2, 0xd8                                          \
-    __asm vpunpcklwd ymm0, ymm2, ymm1           /* ABGR first 8 pixels */      \
-    __asm vpunpckhwd ymm1, ymm2, ymm1           /* ABGR next 8 pixels */       \
+    __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */      \
+    __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */       \
     __asm vmovdqu    [edx], ymm0                                               \
     __asm vmovdqu    [edx + 32], ymm1                                          \
-    __asm lea        edx,  [edx + 64]                                          \
-  }
+    __asm lea        edx,  [edx + 64]}
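
YUVTORGB_AVX2 is a 6-bit fixed-point converter: vpmaddubsw forms the UV
contribution to each of B, G and R, the KUVBIAS constants re-center it,
vpmulhuw scales Y, and the saturated sums are shifted right by 6 before
packing. The STOREARGB/STORERGBA macros then interleave the byte planes.
A scalar sketch using classic BT.601 coefficients scaled by 64 (the real
kYuvConstants tables carry the per-colorspace values, so these numbers
are illustrative):

    static uint8_t Clamp255(int v) {
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    static void YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
                         uint8_t* b, uint8_t* g, uint8_t* r) {
      int y1 = (y - 16) * 75;                                      /* ~1.164 */
      *b = Clamp255((y1 + 129 * (u - 128)) >> 6);                  /* ~2.018 */
      *g = Clamp255((y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6);  /* ~0.391, ~0.813 */
      *r = Clamp255((y1 + 102 * (v - 128)) >> 6);                  /* ~1.596 */
    }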
 
 #ifdef HAS_I422TOARGBROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I422ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void I422ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READYUV422_AVX2
@@ -2119,21 +2104,21 @@
 #ifdef HAS_I422ALPHATOARGBROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
-__declspec(naked)
-void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
-                             const uint8* u_buf,
-                             const uint8* v_buf,
-                             const uint8* a_buf,
-                             uint8* dst_argb,
-                             const struct YuvConstants* yuvconstants,
-                             int width) {
+__declspec(naked) void I422AlphaToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    const uint8_t* a_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
     push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
+    mov        eax, [esp + 16 + 4]  // Y
+    mov        esi, [esp + 16 + 8]  // U
     mov        edi, [esp + 16 + 12]  // V
     mov        ebp, [esp + 16 + 16]  // A
     mov        edx, [esp + 16 + 20]  // argb
@@ -2162,25 +2147,25 @@
 #ifdef HAS_I444TOARGBROW_AVX2
 // 16 pixels
 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I444ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void I444ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
  convertloop:
     READYUV444_AVX2
     YUVTORGB_AVX2(ebx)
@@ -2198,64 +2183,24 @@
 }
 #endif  // HAS_I444TOARGBROW_AVX2
 
-#ifdef HAS_I411TOARGBROW_AVX2
-// 16 pixels
-// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void I411ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
-    mov        edi, [esp + 12 + 12]  // V
-    mov        edx, [esp + 12 + 16]  // abgr
-    mov        ebx, [esp + 12 + 20]  // yuvconstants
-    mov        ecx, [esp + 12 + 24]  // width
-    sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
-
- convertloop:
-    READYUV411_AVX2
-    YUVTORGB_AVX2(ebx)
-    STOREARGB_AVX2
-
-    sub        ecx, 16
-    jg         convertloop
-
-    pop        ebx
-    pop        edi
-    pop        esi
-    vzeroupper
-    ret
-  }
-}
-#endif  // HAS_I411TOARGBROW_AVX2
-
 #ifdef HAS_NV12TOARGBROW_AVX2
 // 16 pixels.
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV12ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* uv_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void NV12ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* uv_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // UV
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // UV
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READNV12_AVX2
@@ -2276,21 +2221,21 @@
 #ifdef HAS_NV21TOARGBROW_AVX2
 // 16 pixels.
 // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
-__declspec(naked)
-void NV21ToARGBRow_AVX2(const uint8* y_buf,
-                        const uint8* vu_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void NV21ToARGBRow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* vu_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // VU
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // VU
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READNV21_AVX2
@@ -2311,18 +2256,18 @@
 #ifdef HAS_YUY2TOARGBROW_AVX2
 // 16 pixels.
 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked)
-void YUY2ToARGBRow_AVX2(const uint8* src_yuy2,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void YUY2ToARGBRow_AVX2(
+    const uint8_t* src_yuy2,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // yuy2
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // yuy2
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READYUY2_AVX2
@@ -2342,18 +2287,18 @@
 #ifdef HAS_UYVYTOARGBROW_AVX2
 // 16 pixels.
 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
-__declspec(naked)
-void UYVYToARGBRow_AVX2(const uint8* src_uyvy,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void UYVYToARGBRow_AVX2(
+    const uint8_t* src_uyvy,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // uyvy
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // uyvy
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READUYVY_AVX2
@@ -2373,25 +2318,25 @@
 #ifdef HAS_I422TORGBAROW_AVX2
 // 16 pixels
 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
-__declspec(naked)
-void I422ToRGBARow_AVX2(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* dst_argb,
-                        const struct YuvConstants* yuvconstants,
-                        int width) {
+__declspec(naked) void I422ToRGBARow_AVX2(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // abgr
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate 0xffffffffffffffff for alpha
 
  convertloop:
     READYUV422_AVX2
@@ -2415,100 +2360,83 @@
 // Allows a conversion with half size scaling.
 
 // Read 8 UV from 444.
-#define READYUV444 __asm {                                                     \
+#define READYUV444 \
+  __asm {                                                     \
     __asm movq       xmm0, qword ptr [esi] /* U */                             \
     __asm movq       xmm1, qword ptr [esi + edi] /* V */                       \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
 
 // Read 4 UV from 422, upsample to 8 UV.
-#define READYUV422 __asm {                                                     \
-    __asm movd       xmm0, [esi]          /* U */                              \
-    __asm movd       xmm1, [esi + edi]    /* V */                              \
+#define READYUV422 \
+  __asm {                                                     \
+    __asm movd       xmm0, [esi] /* U */                              \
+    __asm movd       xmm1, [esi + edi] /* V */                              \
     __asm lea        esi,  [esi + 4]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
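
The punpcklbw/punpcklwd pair above is the whole 4:2:2 upsample: the first interleaves the four U bytes with the four V bytes, the second duplicates each resulting UV word so four chroma samples cover eight pixels. The byte-level effect, as a C sketch (the function name is illustrative):

#include <stdint.h>

/* Byte-level effect of READYUV422's chroma path. */
void read_yuv422_upsample_sketch(const uint8_t u[4], const uint8_t v[4],
                                 uint8_t uv[16]) {
  for (int i = 0; i < 4; ++i) {
    uv[4 * i + 0] = u[i]; /* punpcklbw xmm0, xmm1: U,V interleave      */
    uv[4 * i + 1] = v[i];
    uv[4 * i + 2] = u[i]; /* punpcklwd xmm0, xmm0: duplicate each pair */
    uv[4 * i + 3] = v[i];
  }
}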
 
 // Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
-#define READYUVA422 __asm {                                                    \
-    __asm movd       xmm0, [esi]          /* U */                              \
-    __asm movd       xmm1, [esi + edi]    /* V */                              \
+#define READYUVA422 \
+  __asm {                                                    \
+    __asm movd       xmm0, [esi] /* U */                              \
+    __asm movd       xmm1, [esi + edi] /* V */                              \
     __asm lea        esi,  [esi + 4]                                           \
-    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
-    __asm movq       xmm4, qword ptr [eax]   /* Y */                           \
+    __asm punpcklbw  xmm0, xmm1 /* UV */                             \
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
+    __asm movq       xmm4, qword ptr [eax] /* Y */                           \
     __asm punpcklbw  xmm4, xmm4                                                \
     __asm lea        eax, [eax + 8]                                            \
-    __asm movq       xmm5, qword ptr [ebp]   /* A */                           \
-    __asm lea        ebp, [ebp + 8]                                            \
-  }
+    __asm movq       xmm5, qword ptr [ebp] /* A */                           \
+    __asm lea        ebp, [ebp + 8]}
 
-// Read 2 UV from 411, upsample to 8 UV.
-// drmemory fails with memory fault if pinsrw used. libyuv bug: 525
-//  __asm pinsrw     xmm0, [esi], 0        /* U */
-//  __asm pinsrw     xmm1, [esi + edi], 0  /* V */
-#define READYUV411_EBX __asm {                                                 \
-    __asm movzx      ebx, word ptr [esi]        /* U */                        \
-    __asm movd       xmm0, ebx                                                 \
-    __asm movzx      ebx, word ptr [esi + edi]  /* V */                        \
-    __asm movd       xmm1, ebx                                                 \
-    __asm lea        esi,  [esi + 2]                                           \
-    __asm punpcklbw  xmm0, xmm1            /* UV */                            \
-    __asm punpcklwd  xmm0, xmm0            /* UVUV (upsample) */               \
-    __asm punpckldq  xmm0, xmm0            /* UVUVUVUV (upsample) */           \
-    __asm movq       xmm4, qword ptr [eax]                                     \
-    __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
-
 // Read 4 UV from NV12, upsample to 8 UV.
-#define READNV12 __asm {                                                       \
+#define READNV12 \
+  __asm {                                                       \
     __asm movq       xmm0, qword ptr [esi] /* UV */                            \
     __asm lea        esi,  [esi + 8]                                           \
-    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
+    __asm punpcklwd  xmm0, xmm0 /* UVUV (upsample) */                \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
 
 // Read 4 VU from NV21, upsample to 8 UV.
-#define READNV21 __asm {                                                       \
-    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
+#define READNV21 \
+  __asm {                                                       \
+    __asm movq       xmm0, qword ptr [esi] /* UV */                            \
     __asm lea        esi,  [esi + 8]                                           \
     __asm pshufb     xmm0, xmmword ptr kShuffleNV21                            \
     __asm movq       xmm4, qword ptr [eax]                                     \
     __asm punpcklbw  xmm4, xmm4                                                \
-    __asm lea        eax, [eax + 8]                                            \
-  }
+    __asm lea        eax, [eax + 8]}
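
READNV12 can use its interleaved UV plane directly (punpcklwd duplicates each pair), while READNV21 has no punpcklwd: the kShuffleNV21 pshufb apparently both swaps each V,U byte pair into U,V order and duplicates it, matching the "upsample to 8 UV" comment. A plausible scalar reading, as a sketch; the actual shuffle table is defined elsewhere in this file:

#include <stdint.h>

/* Assumed combined swap-and-upsample of kShuffleNV21:
 * 4 V,U pairs in, 8 U,V pairs out. */
void read_nv21_upsample_sketch(const uint8_t vu[8], uint8_t uv[16]) {
  for (int i = 0; i < 4; ++i) {
    uv[4 * i + 0] = vu[2 * i + 1]; /* U */
    uv[4 * i + 1] = vu[2 * i + 0]; /* V */
    uv[4 * i + 2] = vu[2 * i + 1]; /* duplicated for the next pixel */
    uv[4 * i + 3] = vu[2 * i + 0];
  }
}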
 
 // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV.
-#define READYUY2 __asm {                                                       \
-    __asm movdqu     xmm4, [eax]          /* YUY2 */                           \
+#define READYUY2 \
+  __asm {                                                       \
+    __asm movdqu     xmm4, [eax] /* YUY2 */                           \
     __asm pshufb     xmm4, xmmword ptr kShuffleYUY2Y                           \
-    __asm movdqu     xmm0, [eax]          /* UV */                             \
+    __asm movdqu     xmm0, [eax] /* UV */                             \
     __asm pshufb     xmm0, xmmword ptr kShuffleYUY2UV                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
 
 // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV.
-#define READUYVY __asm {                                                       \
-    __asm movdqu     xmm4, [eax]          /* UYVY */                           \
+#define READUYVY \
+  __asm {                                                       \
+    __asm movdqu     xmm4, [eax] /* UYVY */                           \
     __asm pshufb     xmm4, xmmword ptr kShuffleUYVYY                           \
-    __asm movdqu     xmm0, [eax]          /* UV */                             \
+    __asm movdqu     xmm0, [eax] /* UV */                             \
     __asm pshufb     xmm0, xmmword ptr kShuffleUYVYUV                          \
-    __asm lea        eax, [eax + 16]                                           \
-  }
+    __asm lea        eax, [eax + 16]}
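
YUY2 and UYVY are the packed variants of the same 4:2:2 sampling: YUY2 stores Y0,U,Y1,V per macropixel and UYVY stores U,Y0,V,Y1, so both readers make two pshufb passes over one 16-byte load, one table gathering the Y bytes and the other gathering the UV bytes (and, per the comments above, replicating them to 8 UV). A layout-only sketch for YUY2, with an illustrative function name:

#include <stdint.h>

/* Which bytes of a YUY2 load are luma and which are chroma. The real
 * kShuffleYUY2Y/kShuffleYUY2UV tables also replicate samples into the
 * lane arrangement YUVTORGB expects. */
void read_yuy2_layout_sketch(const uint8_t yuy2[16],
                             uint8_t y[8], uint8_t u[4], uint8_t v[4]) {
  for (int i = 0; i < 4; ++i) {
    y[2 * i + 0] = yuy2[4 * i + 0];
    y[2 * i + 1] = yuy2[4 * i + 2];
    u[i] = yuy2[4 * i + 1];
    v[i] = yuy2[4 * i + 3];
  }
}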
 
 // Convert 8 pixels: 8 UV and 8 Y.
-#define YUVTORGB(YuvConstants) __asm {                                         \
+#define YUVTORGB(YuvConstants) \
+  __asm {                                         \
     __asm movdqa     xmm1, xmm0                                                \
     __asm movdqa     xmm2, xmm0                                                \
     __asm movdqa     xmm3, xmm0                                                \
@@ -2522,129 +2450,125 @@
     __asm pmaddubsw  xmm3, xmmword ptr [YuvConstants + KUVTOR]                 \
     __asm psubw      xmm2, xmm3                                                \
     __asm pmulhuw    xmm4, xmmword ptr [YuvConstants + KYTORGB]                \
-    __asm paddsw     xmm0, xmm4           /* B += Y */                         \
-    __asm paddsw     xmm1, xmm4           /* G += Y */                         \
-    __asm paddsw     xmm2, xmm4           /* R += Y */                         \
+    __asm paddsw     xmm0, xmm4 /* B += Y */                         \
+    __asm paddsw     xmm1, xmm4 /* G += Y */                         \
+    __asm paddsw     xmm2, xmm4 /* R += Y */                         \
     __asm psraw      xmm0, 6                                                   \
     __asm psraw      xmm1, 6                                                   \
     __asm psraw      xmm2, 6                                                   \
-    __asm packuswb   xmm0, xmm0           /* B */                              \
-    __asm packuswb   xmm1, xmm1           /* G */                              \
-    __asm packuswb   xmm2, xmm2           /* R */                              \
+    __asm packuswb   xmm0, xmm0 /* B */                              \
+    __asm packuswb   xmm1, xmm1 /* G */                              \
+    __asm packuswb   xmm2, xmm2 /* R */             \
   }
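
Read channel by channel, YUVTORGB is: a packed multiply-add of the UV pair against one column of the conversion matrix, a bias subtract, a Y term scaled by pmulhuw (multiply, keep the high 16 bits), then a saturating add, an arithmetic shift by the 6 fractional bits, and a saturating pack to bytes. One channel of that shape in scalar C, with uc, vc, yg and bias as assumed stand-ins for the KUVTOB/KUVTOG/KUVTOR/KYTORGB table entries:

#include <stdint.h>

static uint8_t clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Shape of one YUVTORGB channel; the constants are assumptions standing
 * in for the YuvConstants entries, not libyuv's actual values. */
static uint8_t yuvtorgb_channel_sketch(uint8_t y, uint8_t u, uint8_t v,
                                       int uc, int vc, int yg, int bias) {
  int uv = bias - (u * uc + v * vc);                  /* pmaddubsw + psubw   */
  int ys = (int)(((uint32_t)(y * 0x0101) * (uint32_t)yg) >> 16);
                                                      /* punpcklbw + pmulhuw */
  return clamp255((uv + ys) >> 6);                    /* paddsw, psraw 6,
                                                         packuswb            */
}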
 
 // Store 8 ARGB values.
-#define STOREARGB __asm {                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm5           /* RA */                             \
+#define STOREARGB \
+  __asm {                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm5 /* RA */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRA first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRA next 4 pixels */             \
+    __asm punpcklwd  xmm0, xmm2 /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRA next 4 pixels */             \
     __asm movdqu     0[edx], xmm0                                              \
     __asm movdqu     16[edx], xmm1                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
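
STOREARGB's two unpack levels build each pixel from the four channel registers: bytes interleave to BG and RA words, then words interleave to BGRA dwords. The net effect in scalar form (illustrative name):

#include <stdint.h>

/* Net effect of STOREARGB: weave B,G,R and the alpha register into
 * B,G,R,A byte order, 4 bytes per pixel. */
void store_argb_sketch(const uint8_t b[8], const uint8_t g[8],
                       const uint8_t r[8], const uint8_t a[8],
                       uint8_t* dst) {
  for (int i = 0; i < 8; ++i) {
    dst[4 * i + 0] = b[i];
    dst[4 * i + 1] = g[i];
    dst[4 * i + 2] = r[i];
    dst[4 * i + 3] = a[i];
  }
}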
 
 // Store 8 BGRA values.
-#define STOREBGRA __asm {                                                      \
-    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
-    __asm punpcklbw  xmm1, xmm0           /* GB */                             \
-    __asm punpcklbw  xmm5, xmm2           /* AR */                             \
+#define STOREBGRA \
+  __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm0 /* GB */                             \
+    __asm punpcklbw  xmm5, xmm2 /* AR */                             \
     __asm movdqa     xmm0, xmm5                                                \
-    __asm punpcklwd  xmm5, xmm1           /* BGRA first 4 pixels */            \
-    __asm punpckhwd  xmm0, xmm1           /* BGRA next 4 pixels */             \
+    __asm punpcklwd  xmm5, xmm1 /* BGRA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1 /* BGRA next 4 pixels */             \
     __asm movdqu     0[edx], xmm5                                              \
     __asm movdqu     16[edx], xmm0                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
 
 // Store 8 RGBA values.
-#define STORERGBA __asm {                                                      \
-    __asm pcmpeqb    xmm5, xmm5           /* generate 0xffffffff for alpha */  \
-    __asm punpcklbw  xmm1, xmm2           /* GR */                             \
-    __asm punpcklbw  xmm5, xmm0           /* AB */                             \
+#define STORERGBA \
+  __asm {                                                      \
+    __asm pcmpeqb    xmm5, xmm5 /* generate 0xffffffff for alpha */  \
+    __asm punpcklbw  xmm1, xmm2 /* GR */                             \
+    __asm punpcklbw  xmm5, xmm0 /* AB */                             \
     __asm movdqa     xmm0, xmm5                                                \
-    __asm punpcklwd  xmm5, xmm1           /* RGBA first 4 pixels */            \
-    __asm punpckhwd  xmm0, xmm1           /* RGBA next 4 pixels */             \
+    __asm punpcklwd  xmm5, xmm1 /* RGBA first 4 pixels */            \
+    __asm punpckhwd  xmm0, xmm1 /* RGBA next 4 pixels */             \
     __asm movdqu     0[edx], xmm5                                              \
     __asm movdqu     16[edx], xmm0                                             \
-    __asm lea        edx,  [edx + 32]                                          \
-  }
+    __asm lea        edx,  [edx + 32]}
 
 // Store 8 RGB24 values.
-#define STORERGB24 __asm {                                                     \
-    /* Weave into RRGB */                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+#define STORERGB24 \
+  __asm {/* Weave into RRGB */                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2 /* RR */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
-    /* RRGB -> RGB24 */                                                        \
-    __asm pshufb     xmm0, xmm5           /* Pack first 8 and last 4 bytes. */ \
-    __asm pshufb     xmm1, xmm6           /* Pack first 12 bytes. */           \
-    __asm palignr    xmm1, xmm0, 12       /* last 4 bytes of xmm0 + 12 xmm1 */ \
-    __asm movq       qword ptr 0[edx], xmm0  /* First 8 bytes */               \
-    __asm movdqu     8[edx], xmm1         /* Last 16 bytes */                  \
-    __asm lea        edx,  [edx + 24]                                          \
-  }
+    __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */                                                        \
+    __asm pshufb     xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \
+    __asm pshufb     xmm1, xmm6 /* Pack first 12 bytes. */           \
+    __asm palignr    xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \
+    __asm movq       qword ptr 0[edx], xmm0 /* First 8 bytes */               \
+    __asm movdqu     8[edx], xmm1 /* Last 16 bytes */                  \
+    __asm lea        edx,  [edx + 24]}
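
STORERGB24 is the same BG/RR weave followed by two pshufb passes and a palignr that squeeze out the fourth byte, so 8 pixels land in 24 bytes (libyuv's RGB24 is byte order B,G,R). The net effect, sketched in scalar C:

#include <stdint.h>

/* Net effect of STORERGB24: 3 bytes per pixel, no alpha. */
void store_rgb24_sketch(const uint8_t b[8], const uint8_t g[8],
                        const uint8_t r[8], uint8_t* dst) {
  for (int i = 0; i < 8; ++i) {
    dst[3 * i + 0] = b[i];
    dst[3 * i + 1] = g[i];
    dst[3 * i + 2] = r[i];
  }
}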
 
 // Store 8 RGB565 values.
-#define STORERGB565 __asm {                                                    \
-    /* Weave into RRGB */                                                      \
-    __asm punpcklbw  xmm0, xmm1           /* BG */                             \
-    __asm punpcklbw  xmm2, xmm2           /* RR */                             \
+#define STORERGB565 \
+  __asm {/* Weave into RRGB */                                                      \
+    __asm punpcklbw  xmm0, xmm1 /* BG */                             \
+    __asm punpcklbw  xmm2, xmm2 /* RR */                             \
     __asm movdqa     xmm1, xmm0                                                \
-    __asm punpcklwd  xmm0, xmm2           /* BGRR first 4 pixels */            \
-    __asm punpckhwd  xmm1, xmm2           /* BGRR next 4 pixels */             \
-    /* RRGB -> RGB565 */                                                       \
-    __asm movdqa     xmm3, xmm0    /* B  first 4 pixels of argb */             \
-    __asm movdqa     xmm2, xmm0    /* G */                                     \
-    __asm pslld      xmm0, 8       /* R */                                     \
-    __asm psrld      xmm3, 3       /* B */                                     \
-    __asm psrld      xmm2, 5       /* G */                                     \
-    __asm psrad      xmm0, 16      /* R */                                     \
-    __asm pand       xmm3, xmm5    /* B */                                     \
-    __asm pand       xmm2, xmm6    /* G */                                     \
-    __asm pand       xmm0, xmm7    /* R */                                     \
-    __asm por        xmm3, xmm2    /* BG */                                    \
-    __asm por        xmm0, xmm3    /* BGR */                                   \
-    __asm movdqa     xmm3, xmm1    /* B  next 4 pixels of argb */              \
-    __asm movdqa     xmm2, xmm1    /* G */                                     \
-    __asm pslld      xmm1, 8       /* R */                                     \
-    __asm psrld      xmm3, 3       /* B */                                     \
-    __asm psrld      xmm2, 5       /* G */                                     \
-    __asm psrad      xmm1, 16      /* R */                                     \
-    __asm pand       xmm3, xmm5    /* B */                                     \
-    __asm pand       xmm2, xmm6    /* G */                                     \
-    __asm pand       xmm1, xmm7    /* R */                                     \
-    __asm por        xmm3, xmm2    /* BG */                                    \
-    __asm por        xmm1, xmm3    /* BGR */                                   \
+    __asm punpcklwd  xmm0, xmm2 /* BGRR first 4 pixels */            \
+    __asm punpckhwd  xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */                                                       \
+    __asm movdqa     xmm3, xmm0 /* B  first 4 pixels of argb */             \
+    __asm movdqa     xmm2, xmm0 /* G */                                     \
+    __asm pslld      xmm0, 8 /* R */                                     \
+    __asm psrld      xmm3, 3 /* B */                                     \
+    __asm psrld      xmm2, 5 /* G */                                     \
+    __asm psrad      xmm0, 16 /* R */                                     \
+    __asm pand       xmm3, xmm5 /* B */                                     \
+    __asm pand       xmm2, xmm6 /* G */                                     \
+    __asm pand       xmm0, xmm7 /* R */                                     \
+    __asm por        xmm3, xmm2 /* BG */                                    \
+    __asm por        xmm0, xmm3 /* BGR */                                   \
+    __asm movdqa     xmm3, xmm1 /* B  next 4 pixels of argb */              \
+    __asm movdqa     xmm2, xmm1 /* G */                                     \
+    __asm pslld      xmm1, 8 /* R */                                     \
+    __asm psrld      xmm3, 3 /* B */                                     \
+    __asm psrld      xmm2, 5 /* G */                                     \
+    __asm psrad      xmm1, 16 /* R */                                     \
+    __asm pand       xmm3, xmm5 /* B */                                     \
+    __asm pand       xmm2, xmm6 /* G */                                     \
+    __asm pand       xmm1, xmm7 /* R */                                     \
+    __asm por        xmm3, xmm2 /* BG */                                    \
+    __asm por        xmm1, xmm3 /* BGR */                                   \
     __asm packssdw   xmm0, xmm1                                                \
-    __asm movdqu     0[edx], xmm0  /* store 8 pixels of RGB565 */              \
-    __asm lea        edx, [edx + 16]                                           \
-  }
+    __asm movdqu     0[edx], xmm0 /* store 8 pixels of RGB565 */              \
+    __asm lea        edx, [edx + 16]}
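
All of the shifting and masking above is the standard 5:6:5 pack. The three masks it needs, 0x0000001f, 0x000007e0 and 0xfffff800, are exactly what I422ToRGB565Row_SSSE3 below builds from all-ones registers with pcmpeqb plus shifts. Scalar form of the pack, as a sketch:

#include <stdint.h>

/* Scalar equivalent of STORERGB565: top 5 bits of B, 6 of G, 5 of R. */
static uint16_t pack_rgb565_sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}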
 
 // 8 pixels.
 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I444ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void I444ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READYUV444
@@ -2663,19 +2587,19 @@
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
-__declspec(naked)
-void I422ToRGB24Row_SSSE3(const uint8* y_buf,
-                          const uint8* u_buf,
-                          const uint8* v_buf,
-                          uint8* dst_rgb24,
-                          const struct YuvConstants* yuvconstants,
-                          int width) {
+__declspec(naked) void I422ToRGB24Row_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_rgb24,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
@@ -2701,30 +2625,30 @@
 
 // 8 pixels
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
-__declspec(naked)
-void I422ToRGB565Row_SSSE3(const uint8* y_buf,
-                           const uint8* u_buf,
-                           const uint8* v_buf,
-                           uint8* rgb565_buf,
-                           const struct YuvConstants* yuvconstants,
-                           int width) {
+__declspec(naked) void I422ToRGB565Row_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* rgb565_buf,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5       // generate mask 0x0000001f
+    pcmpeqb    xmm5, xmm5  // generate mask 0x0000001f
     psrld      xmm5, 27
-    pcmpeqb    xmm6, xmm6       // generate mask 0x000007e0
+    pcmpeqb    xmm6, xmm6  // generate mask 0x000007e0
     psrld      xmm6, 26
     pslld      xmm6, 5
-    pcmpeqb    xmm7, xmm7       // generate mask 0xfffff800
+    pcmpeqb    xmm7, xmm7  // generate mask 0xfffff800
     pslld      xmm7, 11
 
  convertloop:
@@ -2744,25 +2668,25 @@
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void I422ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void I422ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
     mov        ecx, [esp + 12 + 24]  // width
     sub        edi, esi
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READYUV422
@@ -2781,21 +2705,21 @@
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
-__declspec(naked)
-void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
-                              const uint8* u_buf,
-                              const uint8* v_buf,
-                              const uint8* a_buf,
-                              uint8* dst_argb,
-                              const struct YuvConstants* yuvconstants,
-                              int width) {
+__declspec(naked) void I422AlphaToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    const uint8_t* a_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
     push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
+    mov        eax, [esp + 16 + 4]  // Y
+    mov        esi, [esp + 16 + 8]  // U
     mov        edi, [esp + 16 + 12]  // V
     mov        ebp, [esp + 16 + 16]  // A
     mov        edx, [esp + 16 + 20]  // argb
@@ -2820,62 +2744,22 @@
 }
 
 // 8 pixels.
-// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-// Similar to I420 but duplicate UV once more.
-__declspec(naked)
-void I411ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
-  __asm {
-    push       esi
-    push       edi
-    push       ebx
-    push       ebp
-    mov        eax, [esp + 16 + 4]   // Y
-    mov        esi, [esp + 16 + 8]   // U
-    mov        edi, [esp + 16 + 12]  // V
-    mov        edx, [esp + 16 + 16]  // abgr
-    mov        ebp, [esp + 16 + 20]  // yuvconstants
-    mov        ecx, [esp + 16 + 24]  // width
-    sub        edi, esi
-    pcmpeqb    xmm5, xmm5            // generate 0xffffffff for alpha
-
- convertloop:
-    READYUV411_EBX
-    YUVTORGB(ebp)
-    STOREARGB
-
-    sub        ecx, 8
-    jg         convertloop
-
-    pop        ebp
-    pop        ebx
-    pop        edi
-    pop        esi
-    ret
-  }
-}
-
-// 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV12ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* uv_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void NV12ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* uv_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // UV
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // UV
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READNV12
@@ -2893,21 +2777,21 @@
 
 // 8 pixels.
 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
-__declspec(naked)
-void NV21ToARGBRow_SSSE3(const uint8* y_buf,
-                         const uint8* vu_buf,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void NV21ToARGBRow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* vu_buf,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       ebx
-    mov        eax, [esp + 8 + 4]   // Y
-    mov        esi, [esp + 8 + 8]   // VU
+    mov        eax, [esp + 8 + 4]  // Y
+    mov        esi, [esp + 8 + 8]  // VU
     mov        edx, [esp + 8 + 12]  // argb
     mov        ebx, [esp + 8 + 16]  // yuvconstants
     mov        ecx, [esp + 8 + 20]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READNV21
@@ -2925,18 +2809,18 @@
 
 // 8 pixels.
 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked)
-void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void YUY2ToARGBRow_SSSE3(
+    const uint8_t* src_yuy2,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // yuy2
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // yuy2
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READYUY2
@@ -2953,18 +2837,18 @@
 
 // 8 pixels.
 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes).
-__declspec(naked)
-void UYVYToARGBRow_SSSE3(const uint8* src_uyvy,
-                         uint8* dst_argb,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void UYVYToARGBRow_SSSE3(
+    const uint8_t* src_uyvy,
+    uint8_t* dst_argb,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       ebx
-    mov        eax, [esp + 4 + 4]   // uyvy
-    mov        edx, [esp + 4 + 8]   // argb
+    mov        eax, [esp + 4 + 4]  // uyvy
+    mov        edx, [esp + 4 + 8]  // argb
     mov        ebx, [esp + 4 + 12]  // yuvconstants
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
+    pcmpeqb    xmm5, xmm5  // generate 0xffffffff for alpha
 
  convertloop:
     READUYVY
@@ -2979,19 +2863,19 @@
   }
 }
 
-__declspec(naked)
-void I422ToRGBARow_SSSE3(const uint8* y_buf,
-                         const uint8* u_buf,
-                         const uint8* v_buf,
-                         uint8* dst_rgba,
-                         const struct YuvConstants* yuvconstants,
-                         int width) {
+__declspec(naked) void I422ToRGBARow_SSSE3(
+    const uint8_t* y_buf,
+    const uint8_t* u_buf,
+    const uint8_t* v_buf,
+    uint8_t* dst_rgba,
+    const struct YuvConstants* yuvconstants,
+    int width) {
   __asm {
     push       esi
     push       edi
     push       ebx
-    mov        eax, [esp + 12 + 4]   // Y
-    mov        esi, [esp + 12 + 8]   // U
+    mov        eax, [esp + 12 + 4]  // Y
+    mov        esi, [esp + 12 + 8]  // U
     mov        edi, [esp + 12 + 12]  // V
     mov        edx, [esp + 12 + 16]  // argb
     mov        ebx, [esp + 12 + 20]  // yuvconstants
@@ -3016,39 +2900,38 @@
 
 #ifdef HAS_I400TOARGBROW_SSE2
 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
-__declspec(naked)
-void I400ToARGBRow_SSE2(const uint8* y_buf,
-                        uint8* rgb_buf,
-                        int width) {
+__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf,
+                                          uint8_t* rgb_buf,
+                                          int width) {
   __asm {
-    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
     movd       xmm2, eax
     pshufd     xmm2, xmm2,0
-    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
     movd       xmm3, eax
     pshufd     xmm3, xmm3, 0
-    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
     pslld      xmm4, 24
 
-    mov        eax, [esp + 4]       // Y
-    mov        edx, [esp + 8]       // rgb
-    mov        ecx, [esp + 12]      // width
+    mov        eax, [esp + 4]  // Y
+    mov        edx, [esp + 8]  // rgb
+    mov        ecx, [esp + 12]  // width
 
  convertloop:
-    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+        // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
     movq       xmm0, qword ptr [eax]
     lea        eax, [eax + 8]
-    punpcklbw  xmm0, xmm0           // Y.Y
+    punpcklbw  xmm0, xmm0  // Y.Y
     pmulhuw    xmm0, xmm2
     psubusw    xmm0, xmm3
     psrlw      xmm0, 6
-    packuswb   xmm0, xmm0           // G
+    packuswb   xmm0, xmm0        // G
 
-    // Step 2: Weave into ARGB
-    punpcklbw  xmm0, xmm0           // GG
+        // Step 2: Weave into ARGB
+    punpcklbw  xmm0, xmm0  // GG
     movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
-    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
+    punpcklwd  xmm0, xmm0  // BGRA first 4 pixels
+    punpckhwd  xmm1, xmm1  // BGRA next 4 pixels
     por        xmm0, xmm4
     por        xmm1, xmm4
     movdqu     [edx], xmm0
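
The constants above make the grayscale expansion easy to read off the code: punpcklbw xmm0,xmm0 turns byte Y into the word Y*0x0101, pmulhuw keeps the high 16 bits of a multiply by 18997, psubusw subtracts 1160 with unsigned saturation, and psrlw 6 removes the fixed-point fraction, i.e. roughly G = (Y - 16) * 1.164. One pixel in scalar C, a sketch with an illustrative name:

#include <stdint.h>

/* Scalar reading of I400ToARGBRow_SSE2's math, one pixel per call. */
static uint32_t i400_to_argb_sketch(uint8_t y) {
  uint32_t g = ((uint32_t)y * 0x0101u * 18997u) >> 16; /* pmulhuw  */
  g = g > 1160 ? g - 1160 : 0;                         /* psubusw  */
  g >>= 6;                                             /* psrlw 6  */
  if (g > 255) g = 255;                                /* packuswb */
  return 0xff000000u | (g << 16) | (g << 8) | g;       /* weave, alpha 0xff */
}
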
@@ -3064,41 +2947,40 @@
 #ifdef HAS_I400TOARGBROW_AVX2
 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
 // note: vpunpcklbw mutates and vpackuswb unmutates.
-__declspec(naked)
-void I400ToARGBRow_AVX2(const uint8* y_buf,
-                        uint8* rgb_buf,
-                        int width) {
+__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf,
+                                          uint8_t* rgb_buf,
+                                          int width) {
   __asm {
-    mov        eax, 0x4a354a35      // 4a35 = 18997 = round(1.164 * 64 * 256)
+    mov        eax, 0x4a354a35  // 4a35 = 18997 = round(1.164 * 64 * 256)
     vmovd      xmm2, eax
     vbroadcastss ymm2, xmm2
-    mov        eax, 0x04880488      // 0488 = 1160 = round(1.164 * 64 * 16)
+    mov        eax, 0x04880488  // 0488 = 1160 = round(1.164 * 64 * 16)
     vmovd      xmm3, eax
     vbroadcastss ymm3, xmm3
-    vpcmpeqb   ymm4, ymm4, ymm4     // generate mask 0xff000000
+    vpcmpeqb   ymm4, ymm4, ymm4  // generate mask 0xff000000
     vpslld     ymm4, ymm4, 24
 
-    mov        eax, [esp + 4]       // Y
-    mov        edx, [esp + 8]       // rgb
-    mov        ecx, [esp + 12]      // width
+    mov        eax, [esp + 4]  // Y
+    mov        edx, [esp + 8]  // rgb
+    mov        ecx, [esp + 12]  // width
 
  convertloop:
-    // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164
+        // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
     vmovdqu    xmm0, [eax]
     lea        eax, [eax + 16]
-    vpermq     ymm0, ymm0, 0xd8           // vpunpcklbw mutates
-    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
+    vpermq     ymm0, ymm0, 0xd8  // vpunpcklbw mutates
+    vpunpcklbw ymm0, ymm0, ymm0  // Y.Y
     vpmulhuw   ymm0, ymm0, ymm2
     vpsubusw   ymm0, ymm0, ymm3
     vpsrlw     ymm0, ymm0, 6
-    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
+    vpackuswb  ymm0, ymm0, ymm0        // G.  still mutated: 3120
 
-    // TODO(fbarchard): Weave alpha with unpack.
-    // Step 2: Weave into ARGB
-    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
+        // TODO(fbarchard): Weave alpha with unpack.
+        // Step 2: Weave into ARGB
+    vpunpcklbw ymm1, ymm0, ymm0  // GG - mutates
     vpermq     ymm1, ymm1, 0xd8
-    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
-    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
+    vpunpcklwd ymm0, ymm1, ymm1  // GGGG first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm1  // GGGG next 8 pixels
     vpor       ymm0, ymm0, ymm4
     vpor       ymm1, ymm1, ymm4
     vmovdqu    [edx], ymm0
@@ -3114,16 +2996,16 @@
 
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
-static const uvec8 kShuffleMirror = {
-  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
+                                     7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};
 
 // TODO(fbarchard): Replace lea with -16 offset.
-__declspec(naked)
-void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src,
+                                       uint8_t* dst,
+                                       int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     movdqa    xmm5, xmmword ptr kShuffleMirror
 
@@ -3140,11 +3022,12 @@
 #endif  // HAS_MIRRORROW_SSSE3
 
 #ifdef HAS_MIRRORROW_AVX2
-__declspec(naked)
-void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void MirrorRow_AVX2(const uint8_t* src,
+                                      uint8_t* dst,
+                                      int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     vbroadcastf128 ymm5, xmmword ptr kShuffleMirror
 
@@ -3164,17 +3047,17 @@
 
 #ifdef HAS_MIRRORUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
+static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
 
-__declspec(naked)
-void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
-                       int width) {
+__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src,
+                                         uint8_t* dst_u,
+                                         uint8_t* dst_v,
+                                         int width) {
   __asm {
     push      edi
-    mov       eax, [esp + 4 + 4]   // src
-    mov       edx, [esp + 4 + 8]   // dst_u
+    mov       eax, [esp + 4 + 4]  // src
+    mov       edx, [esp + 4 + 8]  // dst_u
     mov       edi, [esp + 4 + 12]  // dst_v
     mov       ecx, [esp + 4 + 16]  // width
     movdqa    xmm1, xmmword ptr kShuffleMirrorUV
@@ -3198,11 +3081,12 @@
 #endif  // HAS_MIRRORUVROW_SSSE3
 
 #ifdef HAS_ARGBMIRRORROW_SSE2
-__declspec(naked)
-void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src,
+                                          uint8_t* dst,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     lea       eax, [eax - 16 + ecx * 4]  // last 4 pixels.
 
@@ -3221,15 +3105,14 @@
 
 #ifdef HAS_ARGBMIRRORROW_AVX2
 // Shuffle table for reversing the bytes.
-static const ulvec32 kARGBShuffleMirror_AVX2 = {
-  7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
-};
+static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
 
-__declspec(naked)
-void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src,
+                                          uint8_t* dst,
+                                          int width) {
   __asm {
-    mov       eax, [esp + 4]   // src
-    mov       edx, [esp + 8]   // dst
+    mov       eax, [esp + 4]  // src
+    mov       edx, [esp + 8]  // dst
     mov       ecx, [esp + 12]  // width
     vmovdqu   ymm5, ymmword ptr kARGBShuffleMirror_AVX2
 
@@ -3246,16 +3129,17 @@
 #endif  // HAS_ARGBMIRRORROW_AVX2
 
 #ifdef HAS_SPLITUVROW_SSE2
-__declspec(naked)
-void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {
+__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv,
+                                       uint8_t* dst_u,
+                                       uint8_t* dst_v,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_uv
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3265,10 +3149,10 @@
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
     movdqa     xmm3, xmm1
-    pand       xmm0, xmm5   // even bytes
+    pand       xmm0, xmm5  // even bytes
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
-    psrlw      xmm2, 8      // odd bytes
+    psrlw      xmm2, 8  // odd bytes
     psrlw      xmm3, 8
     packuswb   xmm2, xmm3
     movdqu     [edx], xmm0
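
SplitUVRow keeps the even (U) bytes with the 0x00ff00ff mask and shifts the odd (V) bytes down by 8 before re-packing, which is plain plane deinterleaving. A scalar sketch:

#include <stdint.h>

/* Scalar equivalent of SplitUVRow: UVUV... into separate U and V rows. */
void split_uv_row_sketch(const uint8_t* src_uv, uint8_t* dst_u,
                         uint8_t* dst_v, int width) {
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i + 0]; /* even bytes: pand 0x00ff00ff */
    dst_v[i] = src_uv[2 * i + 1]; /* odd bytes:  psrlw 8         */
  }
}
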
@@ -3285,16 +3169,17 @@
 #endif  // HAS_SPLITUVROW_SSE2
 
 #ifdef HAS_SPLITUVROW_AVX2
-__declspec(naked)
-void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
-                     int width) {
+__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv,
+                                       uint8_t* dst_u,
+                                       uint8_t* dst_v,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_uv
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_uv
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3302,9 +3187,9 @@
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm2, ymm0, 8      // odd bytes
+    vpsrlw     ymm2, ymm0, 8  // odd bytes
     vpsrlw     ymm3, ymm1, 8
-    vpand      ymm0, ymm0, ymm5   // even bytes
+    vpand      ymm0, ymm0, ymm5  // even bytes
     vpand      ymm1, ymm1, ymm5
     vpackuswb  ymm0, ymm0, ymm1
     vpackuswb  ymm2, ymm2, ymm3
@@ -3324,24 +3209,25 @@
 #endif  // HAS_SPLITUVROW_AVX2
 
 #ifdef HAS_MERGEUVROW_SSE2
-__declspec(naked)
-void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
+__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u,
+                                       const uint8_t* src_v,
+                                       uint8_t* dst_uv,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_u
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
+    mov        eax, [esp + 4 + 4]  // src_u
+    mov        edx, [esp + 4 + 8]  // src_v
+    mov        edi, [esp + 4 + 12]  // dst_uv
+    mov        ecx, [esp + 4 + 16]  // width
     sub        edx, eax
 
   convertloop:
-    movdqu     xmm0, [eax]      // read 16 U's
+    movdqu     xmm0, [eax]  // read 16 U's
     movdqu     xmm1, [eax + edx]  // and 16 V's
     lea        eax,  [eax + 16]
     movdqa     xmm2, xmm0
-    punpcklbw  xmm0, xmm1       // first 8 UV pairs
-    punpckhbw  xmm2, xmm1       // next 8 UV pairs
+    punpcklbw  xmm0, xmm1  // first 8 UV pairs
+    punpckhbw  xmm2, xmm1  // next 8 UV pairs
     movdqu     [edi], xmm0
     movdqu     [edi + 16], xmm2
     lea        edi, [edi + 32]
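
MergeUVRow is the inverse: punpcklbw/punpckhbw re-interleave the two planes (the AVX2 version then needs the vextractf128 stores because vpunpck* operates per 128-bit lane, leaving the halves "mutated"). Scalar sketch:

#include <stdint.h>

/* Scalar equivalent of MergeUVRow: interleave U and V back into UVUV... */
void merge_uv_row_sketch(const uint8_t* src_u, const uint8_t* src_v,
                         uint8_t* dst_uv, int width) {
  for (int i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = src_u[i];
    dst_uv[2 * i + 1] = src_v[i];
  }
}
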
@@ -3355,24 +3241,25 @@
 #endif  //  HAS_MERGEUVROW_SSE2
 
 #ifdef HAS_MERGEUVROW_AVX2
-__declspec(naked)
-void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
-                     int width) {
+__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u,
+                                       const uint8_t* src_v,
+                                       uint8_t* dst_uv,
+                                       int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_u
-    mov        edx, [esp + 4 + 8]    // src_v
-    mov        edi, [esp + 4 + 12]   // dst_uv
-    mov        ecx, [esp + 4 + 16]   // width
+    mov        eax, [esp + 4 + 4]  // src_u
+    mov        edx, [esp + 4 + 8]  // src_v
+    mov        edi, [esp + 4 + 12]  // dst_uv
+    mov        ecx, [esp + 4 + 16]  // width
     sub        edx, eax
 
   convertloop:
-    vmovdqu    ymm0, [eax]           // read 32 U's
-    vmovdqu    ymm1, [eax + edx]     // and 32 V's
+    vmovdqu    ymm0, [eax]  // read 32 U's
+    vmovdqu    ymm1, [eax + edx]  // and 32 V's
     lea        eax,  [eax + 32]
-    vpunpcklbw ymm2, ymm0, ymm1      // low 16 UV pairs. mutated qqword 0,2
-    vpunpckhbw ymm0, ymm0, ymm1      // high 16 UV pairs. mutated qqword 1,3
-    vextractf128 [edi], ymm2, 0       // bytes 0..15
+    vpunpcklbw ymm2, ymm0, ymm1  // low 16 UV pairs. mutated qqword 0,2
+    vpunpckhbw ymm0, ymm0, ymm1  // high 16 UV pairs. mutated qqword 1,3
+    vextractf128 [edi], ymm2, 0  // bytes 0..15
     vextractf128 [edi + 16], ymm0, 0  // bytes 16..31
     vextractf128 [edi + 32], ymm2, 1  // bytes 32..47
     vextractf128 [edi + 48], ymm0, 1  // bytes 48..63
@@ -3388,13 +3275,14 @@
 #endif  //  HAS_MERGEUVROW_AVX2
 
 #ifdef HAS_COPYROW_SSE2
-// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time.
-__declspec(naked)
-void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
+// CopyRow copies 'width' bytes using a 16 byte load/store, 32 bytes at a time.
+__declspec(naked) void CopyRow_SSE2(const uint8_t* src,
+                                    uint8_t* dst,
+                                    int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
     test       eax, 15
     jne        convertloopu
     test       edx, 15
@@ -3426,13 +3314,14 @@
 #endif  // HAS_COPYROW_SSE2
 
 #ifdef HAS_COPYROW_AVX
-// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time.
-__declspec(naked)
-void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
+// CopyRow copies 'width' bytes using a 32 byte load/store, 64 bytes at a time.
+__declspec(naked) void CopyRow_AVX(const uint8_t* src,
+                                   uint8_t* dst,
+                                   int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
 
   convertloop:
     vmovdqu    ymm0, [eax]
@@ -3451,14 +3340,15 @@
 #endif  // HAS_COPYROW_AVX
 
 // Multiple of 1.
-__declspec(naked)
-void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
+__declspec(naked) void CopyRow_ERMS(const uint8_t* src,
+                                    uint8_t* dst,
+                                    int width) {
   __asm {
     mov        eax, esi
     mov        edx, edi
-    mov        esi, [esp + 4]   // src
-    mov        edi, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        esi, [esp + 4]  // src
+    mov        edi, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
     rep movsb
     mov        edi, edx
     mov        esi, eax
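
CopyRow_ERMS above leans on the "enhanced rep movsb" fast-string
microcode; semantically it is nothing more than a byte copy, as in this
sketch:

    #include <stdint.h>
    #include <string.h>
    static void CopyRow_sketch(const uint8_t* src, uint8_t* dst, int width) {
      memcpy(dst, src, (size_t)width);  /* what 'rep movsb' performs */
    }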
@@ -3468,15 +3358,16 @@
 
 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
 // width in pixels
-__declspec(naked)
-void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src,
+                                             uint8_t* dst,
+                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
     pslld      xmm0, 24
-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
     psrld      xmm1, 8
 
   convertloop:
@@ -3504,14 +3395,15 @@
 
 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
 // width in pixels
-__declspec(naked)
-void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src,
+                                             uint8_t* dst,
+                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm0, ymm0, ymm0
-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+    vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
 
   convertloop:
     vmovdqu    ymm1, [eax]
@@ -3533,11 +3425,12 @@
 
 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
 // width in pixels
-__declspec(naked)
-void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
+__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+                                                uint8_t* dst_a,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_a
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_a
     mov        ecx, [esp + 12]  // width
 
   extractloop:
@@ -3558,17 +3451,54 @@
 }
 #endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
 
+#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
+// width in pixels
+__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+                                                uint8_t* dst_a,
+                                                int width) {
+  __asm {
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_a
+    mov        ecx, [esp + 12]  // width
+    vmovdqa    ymm4, ymmword ptr kPermdARGBToY_AVX
+
+  extractloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vpsrld     ymm0, ymm0, 24
+    vpsrld     ymm1, ymm1, 24
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    lea        eax, [eax + 128]
+    vpackssdw  ymm0, ymm0, ymm1  // mutates
+    vpsrld     ymm2, ymm2, 24
+    vpsrld     ymm3, ymm3, 24
+    vpackssdw  ymm2, ymm2, ymm3  // mutates
+    vpackuswb  ymm0, ymm0, ymm2  // mutates
+    vpermd     ymm0, ymm4, ymm0  // unmutate
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    sub        ecx, 32
+    jg         extractloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
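
The pack/permute dance in the new AVX2 extractor exists because
vpackssdw/vpackuswb operate per 128-bit lane; vpermd with
kPermdARGBToY_AVX restores byte order afterwards. The operation itself
is simply (a sketch):

    #include <stdint.h>
    static void ARGBExtractAlphaRow_sketch(const uint8_t* src_argb,
                                           uint8_t* dst_a, int width) {
      for (int i = 0; i < width; ++i) {
        dst_a[i] = src_argb[4 * i + 3];  /* alpha is byte 3 of each pixel */
      }
    }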
+
 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
 // width in pixels
-__declspec(naked)
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src,
+                                                uint8_t* dst,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
-    pcmpeqb    xmm0, xmm0       // generate mask 0xff000000
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm0, xmm0  // generate mask 0xff000000
     pslld      xmm0, 24
-    pcmpeqb    xmm1, xmm1       // generate mask 0x00ffffff
+    pcmpeqb    xmm1, xmm1  // generate mask 0x00ffffff
     psrld      xmm1, 8
 
   convertloop:
@@ -3598,14 +3528,15 @@
 
 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
 // width in pixels
-__declspec(naked)
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
+__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src,
+                                                uint8_t* dst,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   // src
-    mov        edx, [esp + 8]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        eax, [esp + 4]  // src
+    mov        edx, [esp + 8]  // dst
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm0, ymm0, ymm0
-    vpsrld     ymm0, ymm0, 8    // generate mask 0x00ffffff
+    vpsrld     ymm0, ymm0, 8  // generate mask 0x00ffffff
 
   convertloop:
     vpmovzxbd  ymm1, qword ptr [eax]
@@ -3628,17 +3559,16 @@
 #endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
 
 #ifdef HAS_SETROW_X86
-// Write 'count' bytes using an 8 bit value repeated.
-// Count should be multiple of 4.
-__declspec(naked)
-void SetRow_X86(uint8* dst, uint8 v8, int count) {
+// Write 'width' bytes using an 8-bit value repeated.
+// width should be a multiple of 4.
+__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) {
   __asm {
-    movzx      eax, byte ptr [esp + 8]    // v8
+    movzx      eax, byte ptr [esp + 8]  // v8
     mov        edx, 0x01010101  // Duplicate byte to all bytes.
-    mul        edx              // overwrites edx with upper part of result.
+    mul        edx  // overwrites edx with upper part of result.
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        ecx, [esp + 12]  // count
+    mov        edi, [esp + 4]  // dst
+    mov        ecx, [esp + 12]  // width
     shr        ecx, 2
     rep stosd
     mov        edi, edx
@@ -3646,14 +3576,13 @@
   }
 }
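
The multiply by 0x01010101 above splats the byte into all four lanes of
eax so 'rep stosd' can store four bytes per iteration, which is why
width is rounded down to a multiple of 4 by 'shr ecx, 2'. In C terms (a
sketch; the cast assumes dst is suitably aligned):

    #include <stdint.h>
    static void SetRow_sketch(uint8_t* dst, uint8_t v8, int width) {
      uint32_t v32 = (uint32_t)v8 * 0x01010101u;  /* duplicate the byte */
      uint32_t* d = (uint32_t*)dst;
      for (int i = 0; i < width / 4; ++i) {
        d[i] = v32;  /* one dword store, like 'rep stosd' */
      }
    }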
 
-// Write 'count' bytes using an 8 bit value repeated.
-__declspec(naked)
-void SetRow_ERMS(uint8* dst, uint8 v8, int count) {
+// Write 'width' bytes using an 8-bit value repeated.
+__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) {
   __asm {
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v8
-    mov        ecx, [esp + 12]  // count
+    mov        edi, [esp + 4]  // dst
+    mov        eax, [esp + 8]  // v8
+    mov        ecx, [esp + 12]  // width
     rep stosb
     mov        edi, edx
     ret
@@ -3660,14 +3589,15 @@
   }
 }
 
-// Write 'count' 32 bit values.
-__declspec(naked)
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) {
+// Write 'width' 32-bit values.
+__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb,
+                                      uint32_t v32,
+                                      int width) {
   __asm {
     mov        edx, edi
-    mov        edi, [esp + 4]   // dst
-    mov        eax, [esp + 8]   // v32
-    mov        ecx, [esp + 12]  // count
+    mov        edi, [esp + 4]  // dst
+    mov        eax, [esp + 8]  // v32
+    mov        ecx, [esp + 12]  // width
     rep stosd
     mov        edi, edx
     ret
@@ -3676,12 +3606,13 @@
 #endif  // HAS_SETROW_X86
 
 #ifdef HAS_YUY2TOYROW_AVX2
-__declspec(naked)
-void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
+__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
+    mov        eax, [esp + 4]  // src_yuy2
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
 
@@ -3689,9 +3620,9 @@
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // even bytes are Y
+    vpand      ymm0, ymm0, ymm5  // even bytes are Y
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -3702,18 +3633,20 @@
   }
 }
 
-__declspec(naked)
-void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
+                                        int stride_yuy2,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3723,18 +3656,18 @@
     vpavgb     ymm0, ymm0, [eax + esi]
     vpavgb     ymm1, ymm1, [eax + esi + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3746,16 +3679,17 @@
   }
 }
 
-__declspec(naked)
-void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3763,18 +3697,18 @@
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // YUYV -> UVUV
+    vpsrlw     ymm0, ymm0, 8  // YUYV -> UVUV
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3785,21 +3719,21 @@
   }
 }
 
-__declspec(naked)
-void UYVYToYRow_AVX2(const uint8* src_uyvy,
-                     uint8* dst_y, int width) {
+__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
+    mov        eax, [esp + 4]  // src_uyvy
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
 
   convertloop:
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpsrlw     ymm0, ymm0, 8      // odd bytes are Y
+    vpsrlw     ymm0, ymm0, 8  // odd bytes are Y
     vpsrlw     ymm1, ymm1, 8
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -3810,18 +3744,20 @@
   }
 }
 
-__declspec(naked)
-void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy,
+                                        int stride_uyvy,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_uyvy
+    mov        esi, [esp + 8 + 8]  // stride_uyvy
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3831,18 +3767,18 @@
     vpavgb     ymm0, ymm0, [eax + esi]
     vpavgb     ymm1, ymm1, [eax + esi + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3854,16 +3790,17 @@
   }
 }
 
-__declspec(naked)
-void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_uyvy
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0x00ff00ff
     vpsrlw     ymm5, ymm5, 8
     sub        edi, edx
 
@@ -3871,18 +3808,18 @@
     vmovdqu    ymm0, [eax]
     vmovdqu    ymm1, [eax + 32]
     lea        eax,  [eax + 64]
-    vpand      ymm0, ymm0, ymm5   // UYVY -> UVUV
+    vpand      ymm0, ymm0, ymm5  // UYVY -> UVUV
     vpand      ymm1, ymm1, ymm5
-    vpackuswb  ymm0, ymm0, ymm1   // mutates.
+    vpackuswb  ymm0, ymm0, ymm1  // mutates.
     vpermq     ymm0, ymm0, 0xd8
     vpand      ymm1, ymm0, ymm5  // U
-    vpsrlw     ymm0, ymm0, 8     // V
+    vpsrlw     ymm0, ymm0, 8  // V
     vpackuswb  ymm1, ymm1, ymm1  // mutates.
     vpackuswb  ymm0, ymm0, ymm0  // mutates.
     vpermq     ymm1, ymm1, 0xd8
     vpermq     ymm0, ymm0, 0xd8
     vextractf128 [edx], ymm1, 0  // U
-    vextractf128 [edx + edi], ymm0, 0 // V
+    vextractf128 [edx + edi], ymm0, 0  // V
     lea        edx, [edx + 16]
     sub        ecx, 32
     jg         convertloop
@@ -3895,14 +3832,14 @@
 #endif  // HAS_YUY2TOYROW_AVX2
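
In YUY2 the even bytes are luma and the odd bytes alternate U and V
(Y0 U0 Y1 V0 ...); UYVY swaps the roles, which is why the UYVY rows
mask with 0x00ff00ff where the YUY2 rows shift. The ToUV variants also
average two source rows with (v)pavgb, since 4:2:2 to 4:2:0 halves the
chroma vertically. A scalar sketch of both steps for YUY2 (illustrative
names):

    #include <stdint.h>
    static void YUY2ToYRow_sketch(const uint8_t* src_yuy2, uint8_t* dst_y,
                                  int width) {
      for (int i = 0; i < width; ++i)
        dst_y[i] = src_yuy2[2 * i];  /* even bytes are Y */
    }
    static void YUY2ToUVRow_sketch(const uint8_t* src_yuy2, int stride_yuy2,
                                   uint8_t* dst_u, uint8_t* dst_v, int width) {
      const uint8_t* next = src_yuy2 + stride_yuy2;
      for (int i = 0; i < width / 2; ++i) {  /* one U,V per 2 pixels */
        dst_u[i] = (uint8_t)((src_yuy2[4 * i + 1] + next[4 * i + 1] + 1) >> 1);
        dst_v[i] = (uint8_t)((src_yuy2[4 * i + 3] + next[4 * i + 3] + 1) >> 1);
      }
    }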
 
 #ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked)
-void YUY2ToYRow_SSE2(const uint8* src_yuy2,
-                     uint8* dst_y, int width) {
+__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_yuy2
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
+    mov        eax, [esp + 4]  // src_yuy2
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
 
   convertloop:
@@ -3909,7 +3846,7 @@
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // even bytes are Y
+    pand       xmm0, xmm5  // even bytes are Y
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -3920,18 +3857,20 @@
   }
 }
 
-__declspec(naked)
-void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
+                                        int stride_yuy2,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_yuy2
+    mov        esi, [esp + 8 + 8]  // stride_yuy2
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3943,13 +3882,13 @@
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
-    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm0, 8  // YUYV -> UVUV
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -3963,16 +3902,17 @@
   }
 }
 
-__declspec(naked)
-void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_yuy2
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -3980,13 +3920,13 @@
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8      // YUYV -> UVUV
+    psrlw      xmm0, 8  // YUYV -> UVUV
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -3999,19 +3939,19 @@
   }
 }
 
-__declspec(naked)
-void UYVYToYRow_SSE2(const uint8* src_uyvy,
-                     uint8* dst_y, int width) {
+__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy,
+                                       uint8_t* dst_y,
+                                       int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_uyvy
-    mov        edx, [esp + 8]    // dst_y
-    mov        ecx, [esp + 12]   // width
+    mov        eax, [esp + 4]  // src_uyvy
+    mov        edx, [esp + 8]  // dst_y
+    mov        ecx, [esp + 12]  // width
 
   convertloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8    // odd bytes are Y
+    psrlw      xmm0, 8  // odd bytes are Y
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -4022,18 +3962,20 @@
   }
 }
 
-__declspec(naked)
-void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy,
+                                        int stride_uyvy,
+                                        uint8_t* dst_u,
+                                        uint8_t* dst_v,
+                                        int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_yuy2
-    mov        esi, [esp + 8 + 8]    // stride_yuy2
-    mov        edx, [esp + 8 + 12]   // dst_u
-    mov        edi, [esp + 8 + 16]   // dst_v
-    mov        ecx, [esp + 8 + 20]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 8 + 4]  // src_uyvy
+    mov        esi, [esp + 8 + 8]  // stride_uyvy
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -4045,13 +3987,13 @@
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2
     pavgb      xmm1, xmm3
-    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm0, xmm5  // UYVY -> UVUV
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -4065,16 +4007,17 @@
   }
 }
 
-__declspec(naked)
-void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
-                         uint8* dst_u, uint8* dst_v, int width) {
+__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy,
+                                           uint8_t* dst_u,
+                                           uint8_t* dst_v,
+                                           int width) {
   __asm {
     push       edi
-    mov        eax, [esp + 4 + 4]    // src_yuy2
-    mov        edx, [esp + 4 + 8]    // dst_u
-    mov        edi, [esp + 4 + 12]   // dst_v
-    mov        ecx, [esp + 4 + 16]   // width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    mov        eax, [esp + 4 + 4]  // src_uyvy
+    mov        edx, [esp + 4 + 8]  // dst_u
+    mov        edi, [esp + 4 + 12]  // dst_v
+    mov        ecx, [esp + 4 + 16]  // width
+    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff00ff
     psrlw      xmm5, 8
     sub        edi, edx
 
@@ -4082,13 +4025,13 @@
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pand       xmm0, xmm5   // UYVY -> UVUV
+    pand       xmm0, xmm5  // UYVY -> UVUV
     pand       xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqa     xmm1, xmm0
     pand       xmm0, xmm5  // U
     packuswb   xmm0, xmm0
-    psrlw      xmm1, 8     // V
+    psrlw      xmm1, 8  // V
     packuswb   xmm1, xmm1
     movq       qword ptr [edx], xmm0
     movq       qword ptr [edx + edi], xmm1
@@ -4108,13 +4051,15 @@
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked)
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
+__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0,
+                                           const uint8_t* src1,
+                                           const uint8_t* alpha,
+                                           uint8_t* dst,
+                                           int width) {
   __asm {
     push       esi
     push       edi
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
     psllw      xmm5, 8
     mov        eax, 0x80808080  // 128 for biasing image to signed.
     movd       xmm6, eax
@@ -4123,8 +4068,8 @@
     mov        eax, 0x807f807f  // 32768 + 127 for unbias and round.
     movd       xmm7, eax
     pshufd     xmm7, xmm7, 0x00
-    mov        eax, [esp + 8 + 4]   // src0
-    mov        edx, [esp + 8 + 8]   // src1
+    mov        eax, [esp + 8 + 4]  // src0
+    mov        edx, [esp + 8 + 8]  // src1
     mov        esi, [esp + 8 + 12]  // alpha
     mov        edi, [esp + 8 + 16]  // dst
     mov        ecx, [esp + 8 + 20]  // width
@@ -4132,17 +4077,17 @@
     sub        edx, esi
     sub        edi, esi
 
-    // 8 pixel loop.
+        // 8 pixel loop.
   convertloop8:
-    movq       xmm0, qword ptr [esi]        // alpha
+    movq       xmm0, qword ptr [esi]  // alpha
     punpcklbw  xmm0, xmm0
-    pxor       xmm0, xmm5         // a, 255-a
+    pxor       xmm0, xmm5  // a, 255-a
     movq       xmm1, qword ptr [eax + esi]  // src0
     movq       xmm2, qword ptr [edx + esi]  // src1
     punpcklbw  xmm1, xmm2
-    psubb      xmm1, xmm6         // bias src0/1 - 128
+    psubb      xmm1, xmm6  // bias src0/1 - 128
     pmaddubsw  xmm0, xmm1
-    paddw      xmm0, xmm7         // unbias result - 32768 and round.
+    paddw      xmm0, xmm7  // unbias result - 32768 and round.
     psrlw      xmm0, 8
     packuswb   xmm0, xmm0
     movq       qword ptr [edi + esi], xmm0
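
The signed variant in the comments exists because pmaddubsw multiplies
an unsigned operand by a signed one: biasing src0/src1 down by 128
(psubb xmm6) keeps them in signed range, and adding 0x807f (32768 +
127) unbiases and rounds before the final shift. Per pixel, the
unsigned math is the following sketch:

    #include <stdint.h>
    /* dst = (src0 * a + src1 * (255 - a) + 255) / 256, per the comment. */
    static uint8_t BlendPixel_sketch(uint8_t s0, uint8_t s1, uint8_t a) {
      return (uint8_t)((s0 * a + s1 * (255 - a) + 255) >> 8);
    }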
@@ -4163,13 +4108,15 @@
 // =((A2*C2)+(B2*(255-C2))+255)/256
 // signed version of math
 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
-__declspec(naked)
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width) {
+__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0,
+                                          const uint8_t* src1,
+                                          const uint8_t* alpha,
+                                          uint8_t* dst,
+                                          int width) {
   __asm {
     push        esi
     push        edi
-    vpcmpeqb    ymm5, ymm5, ymm5       // generate mask 0xff00ff00
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0xff00ff00
     vpsllw      ymm5, ymm5, 8
     mov         eax, 0x80808080  // 128 for biasing image to signed.
     vmovd       xmm6, eax
@@ -4177,8 +4124,8 @@
     mov         eax, 0x807f807f  // 32768 + 127 for unbias and round.
     vmovd       xmm7, eax
     vbroadcastss ymm7, xmm7
-    mov         eax, [esp + 8 + 4]   // src0
-    mov         edx, [esp + 8 + 8]   // src1
+    mov         eax, [esp + 8 + 4]  // src0
+    mov         edx, [esp + 8 + 8]  // src1
     mov         esi, [esp + 8 + 12]  // alpha
     mov         edi, [esp + 8 + 16]  // dst
     mov         ecx, [esp + 8 + 20]  // width
@@ -4186,23 +4133,23 @@
     sub         edx, esi
     sub         edi, esi
 
-    // 32 pixel loop.
+        // 32 pixel loop.
   convertloop32:
-    vmovdqu     ymm0, [esi]        // alpha
-    vpunpckhbw  ymm3, ymm0, ymm0   // 8..15, 24..31
-    vpunpcklbw  ymm0, ymm0, ymm0   // 0..7, 16..23
-    vpxor       ymm3, ymm3, ymm5   // a, 255-a
-    vpxor       ymm0, ymm0, ymm5   // a, 255-a
+    vmovdqu     ymm0, [esi]  // alpha
+    vpunpckhbw  ymm3, ymm0, ymm0  // 8..15, 24..31
+    vpunpcklbw  ymm0, ymm0, ymm0  // 0..7, 16..23
+    vpxor       ymm3, ymm3, ymm5  // a, 255-a
+    vpxor       ymm0, ymm0, ymm5  // a, 255-a
     vmovdqu     ymm1, [eax + esi]  // src0
     vmovdqu     ymm2, [edx + esi]  // src1
     vpunpckhbw  ymm4, ymm1, ymm2
     vpunpcklbw  ymm1, ymm1, ymm2
-    vpsubb      ymm4, ymm4, ymm6   // bias src0/1 - 128
-    vpsubb      ymm1, ymm1, ymm6   // bias src0/1 - 128
+    vpsubb      ymm4, ymm4, ymm6  // bias src0/1 - 128
+    vpsubb      ymm1, ymm1, ymm6  // bias src0/1 - 128
     vpmaddubsw  ymm3, ymm3, ymm4
     vpmaddubsw  ymm0, ymm0, ymm1
-    vpaddw      ymm3, ymm3, ymm7   // unbias result - 32768 and round.
-    vpaddw      ymm0, ymm0, ymm7   // unbias result - 32768 and round.
+    vpaddw      ymm3, ymm3, ymm7  // unbias result - 32768 and round.
+    vpaddw      ymm0, ymm0, ymm7  // unbias result - 32768 and round.
     vpsrlw      ymm3, ymm3, 8
     vpsrlw      ymm0, ymm0, 8
     vpackuswb   ymm0, ymm0, ymm3
@@ -4221,52 +4168,51 @@
 
 #ifdef HAS_ARGBBLENDROW_SSSE3
 // Shuffle table for isolating alpha.
-static const uvec8 kShuffleAlpha = {
-  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
-  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
-};
+static const uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
+                                    11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
 
 // Blend 8 pixels at a time.
-__declspec(naked)
-void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
+__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+                                          const uint8_t* src_argb1,
+                                          uint8_t* dst_argb,
+                                          int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    pcmpeqb    xmm7, xmm7       // generate constant 0x0001
+    pcmpeqb    xmm7, xmm7  // generate constant 0x0001
     psrlw      xmm7, 15
-    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
+    pcmpeqb    xmm6, xmm6  // generate mask 0x00ff00ff
     psrlw      xmm6, 8
-    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
+    pcmpeqb    xmm5, xmm5  // generate mask 0xff00ff00
     psllw      xmm5, 8
-    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff000000
     pslld      xmm4, 24
     sub        ecx, 4
-    jl         convertloop4b    // less than 4 pixels?
+    jl         convertloop4b  // less than 4 pixels?
 
-    // 4 pixel loop.
+        // 4 pixel loop.
   convertloop4:
-    movdqu     xmm3, [eax]      // src argb
+    movdqu     xmm3, [eax]  // src argb
     lea        eax, [eax + 16]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movdqu     xmm2, [esi]      // _r_b
-    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movdqu     xmm1, [esi]      // _a_g
+    movdqa     xmm0, xmm3  // src argb
+    pxor       xmm3, xmm4  // ~alpha
+    movdqu     xmm2, [esi]  // _r_b
+    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
+    pand       xmm2, xmm6  // _r_b
+    paddw      xmm3, xmm7  // 256 - alpha
+    pmullw     xmm2, xmm3  // _r_b * alpha
+    movdqu     xmm1, [esi]  // _a_g
     lea        esi, [esi + 16]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
+    psrlw      xmm1, 8  // _a_g
+    por        xmm0, xmm4  // set alpha to 255
+    pmullw     xmm1, xmm3  // _a_g * alpha
+    psrlw      xmm2, 8  // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2  // + src argb
+    pand       xmm1, xmm5  // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1  // + src argb
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4276,26 +4222,26 @@
     add        ecx, 4 - 1
     jl         convertloop1b
 
-    // 1 pixel loop.
+        // 1 pixel loop.
   convertloop1:
-    movd       xmm3, [eax]      // src argb
+    movd       xmm3, [eax]  // src argb
     lea        eax, [eax + 4]
-    movdqa     xmm0, xmm3       // src argb
-    pxor       xmm3, xmm4       // ~alpha
-    movd       xmm2, [esi]      // _r_b
-    pshufb     xmm3, xmmword ptr kShuffleAlpha // alpha
-    pand       xmm2, xmm6       // _r_b
-    paddw      xmm3, xmm7       // 256 - alpha
-    pmullw     xmm2, xmm3       // _r_b * alpha
-    movd       xmm1, [esi]      // _a_g
+    movdqa     xmm0, xmm3  // src argb
+    pxor       xmm3, xmm4  // ~alpha
+    movd       xmm2, [esi]  // _r_b
+    pshufb     xmm3, xmmword ptr kShuffleAlpha  // alpha
+    pand       xmm2, xmm6  // _r_b
+    paddw      xmm3, xmm7  // 256 - alpha
+    pmullw     xmm2, xmm3  // _r_b * alpha
+    movd       xmm1, [esi]  // _a_g
     lea        esi, [esi + 4]
-    psrlw      xmm1, 8          // _a_g
-    por        xmm0, xmm4       // set alpha to 255
-    pmullw     xmm1, xmm3       // _a_g * alpha
-    psrlw      xmm2, 8          // _r_b convert to 8 bits again
-    paddusb    xmm0, xmm2       // + src argb
-    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
-    paddusb    xmm0, xmm1       // + src argb
+    psrlw      xmm1, 8  // _a_g
+    por        xmm0, xmm4  // set alpha to 255
+    pmullw     xmm1, xmm3  // _a_g * alpha
+    psrlw      xmm2, 8  // _r_b convert to 8 bits again
+    paddusb    xmm0, xmm2  // + src argb
+    pand       xmm1, xmm5  // a_g_ convert to 8 bits again
+    paddusb    xmm0, xmm1  // + src argb
     movd       [edx], xmm0
     lea        edx, [edx + 4]
     sub        ecx, 1
@@ -4311,41 +4257,42 @@
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha.
 static const uvec8 kShuffleAlpha0 = {
-  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
+    3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
 };
 static const uvec8 kShuffleAlpha1 = {
-  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
-  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
+    11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
+    15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
 };
-__declspec(naked)
-void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
+                                              uint8_t* dst_argb,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
-    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000
+    pcmpeqb    xmm3, xmm3  // generate mask 0xff000000
     pslld      xmm3, 24
     movdqa     xmm4, xmmword ptr kShuffleAlpha0
     movdqa     xmm5, xmmword ptr kShuffleAlpha1
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
-    pshufb     xmm0, xmm4       // isolate first 2 alphas
-    movdqu     xmm1, [eax]      // read 4 pixels
-    punpcklbw  xmm1, xmm1       // first 2 pixel rgbs
-    pmulhuw    xmm0, xmm1       // rgb * a
-    movdqu     xmm1, [eax]      // read 4 pixels
-    pshufb     xmm1, xmm5       // isolate next 2 alphas
-    movdqu     xmm2, [eax]      // read 4 pixels
-    punpckhbw  xmm2, xmm2       // next 2 pixel rgbs
-    pmulhuw    xmm1, xmm2       // rgb * a
-    movdqu     xmm2, [eax]      // mask original alpha
+    movdqu     xmm0, [eax]  // read 4 pixels
+    pshufb     xmm0, xmm4  // isolate first 2 alphas
+    movdqu     xmm1, [eax]  // read 4 pixels
+    punpcklbw  xmm1, xmm1  // first 2 pixel rgbs
+    pmulhuw    xmm0, xmm1  // rgb * a
+    movdqu     xmm1, [eax]  // read 4 pixels
+    pshufb     xmm1, xmm5  // isolate next 2 alphas
+    movdqu     xmm2, [eax]  // read 4 pixels
+    punpckhbw  xmm2, xmm2  // next 2 pixel rgbs
+    pmulhuw    xmm1, xmm2  // rgb * a
+    movdqu     xmm2, [eax]  // mask original alpha
     lea        eax, [eax + 16]
     pand       xmm2, xmm3
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
-    por        xmm0, xmm2       // copy original alpha
+    por        xmm0, xmm2  // copy original alpha
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
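
Attenuation premultiplies each color channel by the pixel's own alpha:
kShuffleAlpha0/1 replicate the alpha byte across each 16-bit lane,
punpcklbw/punpckhbw duplicate the color bytes, and pmulhuw plus the
following psrlw 8 compute roughly (c * a) >> 8 while the original alpha
is masked back in. Conceptually (a sketch; rounding differs slightly
from the SIMD path):

    #include <stdint.h>
    static void ARGBAttenuatePixel_sketch(const uint8_t* src, uint8_t* dst) {
      uint32_t a = src[3];
      dst[0] = (uint8_t)((src[0] * a) >> 8);  /* B */
      dst[1] = (uint8_t)((src[1] * a) >> 8);  /* G */
      dst[2] = (uint8_t)((src[2] * a) >> 8);  /* R */
      dst[3] = (uint8_t)a;                    /* alpha kept as-is */
    }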
@@ -4358,22 +4305,23 @@
 
 #ifdef HAS_ARGBATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
-static const uvec8 kShuffleAlpha_AVX2 = {
-  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u
-};
-__declspec(naked)
-void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
+static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
+                                         128u, 128u, 14u,  15u, 14u, 15u,
+                                         14u,  15u,  128u, 128u};
+__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
+                                             uint8_t* dst_argb,
+                                             int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2
-    vpcmpeqb   ymm5, ymm5, ymm5 // generate mask 0xff000000
+    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
     vpslld     ymm5, ymm5, 24
 
  convertloop:
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpshufb    ymm2, ymm0, ymm4  // low 4 alphas
@@ -4398,40 +4346,40 @@
 
 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
 // Unattenuate 4 pixels at a time.
-__declspec(naked)
-void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb,
+                                               uint8_t* dst_argb,
+                                               int width) {
   __asm {
     push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 12 + 4]   // src_argb
-    mov        edx, [esp + 12 + 8]   // dst_argb
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        edx, [esp + 12 + 8]  // dst_argb
     mov        ecx, [esp + 12 + 12]  // width
     lea        ebx, fixed_invtbl8
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]  // read 4 pixels
     movzx      esi, byte ptr [eax + 3]  // first alpha
     movzx      edi, byte ptr [eax + 7]  // second alpha
-    punpcklbw  xmm0, xmm0       // first 2
+    punpcklbw  xmm0, xmm0  // first 2
     movd       xmm2, dword ptr [ebx + esi * 4]
     movd       xmm3, dword ptr [ebx + edi * 4]
-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words.  1, a, a, a
-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words.  1, a, a, a
+    pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
     movlhps    xmm2, xmm3
-    pmulhuw    xmm0, xmm2       // rgb * a
+    pmulhuw    xmm0, xmm2  // rgb * a
 
-    movdqu     xmm1, [eax]      // read 4 pixels
+    movdqu     xmm1, [eax]  // read 4 pixels
     movzx      esi, byte ptr [eax + 11]  // third alpha
     movzx      edi, byte ptr [eax + 15]  // fourth alpha
-    punpckhbw  xmm1, xmm1       // next 2
+    punpckhbw  xmm1, xmm1  // next 2
     movd       xmm2, dword ptr [ebx + esi * 4]
     movd       xmm3, dword ptr [ebx + edi * 4]
-    pshuflw    xmm2, xmm2, 040h // first 4 inv_alpha words
-    pshuflw    xmm3, xmm3, 040h // next 4 inv_alpha words
+    pshuflw    xmm2, xmm2, 040h  // first 4 inv_alpha words
+    pshuflw    xmm3, xmm3, 040h  // next 4 inv_alpha words
     movlhps    xmm2, xmm3
-    pmulhuw    xmm1, xmm2       // rgb * a
+    pmulhuw    xmm1, xmm2  // rgb * a
     lea        eax, [eax + 16]
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
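
Unattenuate reverses that by multiplying with a per-alpha fixed-point
reciprocal from fixed_invtbl8; the movzx/movd/pshuflw sequence above is
a manual gather of those table entries. Conceptually, per channel (a
sketch; the table avoids the divide):

    #include <stdint.h>
    static uint8_t UnattenuatePixel_sketch(uint8_t c, uint8_t a) {
      if (a == 0) return c;                          /* nothing to recover */
      uint32_t v = ((uint32_t)c * 255 + a / 2) / a;  /* divide alpha out */
      return (uint8_t)(v > 255 ? 255 : v);           /* saturate like packuswb */
    }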
@@ -4450,25 +4398,24 @@
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Shuffle table duplicating alpha.
 static const uvec8 kUnattenShuffleAlpha_AVX2 = {
-  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u
-};
+    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
 // TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
 // USE_GATHER is not on by default, due to being a slow instruction.
 #ifdef USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                                               uint8_t* dst_argb,
+                                               int width) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb0
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     sub        edx, eax
     vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2
 
  convertloop:
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xffffffff for gather.
-    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
+    vpsrld     ymm2, ymm6, 24  // alpha in low 8 bits.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5  // ymm5 cleared.  1, a
@@ -4488,17 +4435,17 @@
     ret
   }
 }
-#else  // USE_GATHER
-__declspec(naked)
-void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                             int width) {
+#else   // USE_GATHER
+__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb,
+                                               uint8_t* dst_argb,
+                                               int width) {
   __asm {
 
     push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 12 + 4]   // src_argb
-    mov        edx, [esp + 12 + 8]   // dst_argb
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        edx, [esp + 12 + 8]  // dst_argb
     mov        ecx, [esp + 12 + 12]  // width
     sub        edx, eax
     lea        ebx, fixed_invtbl8
@@ -4505,33 +4452,33 @@
     vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2
 
  convertloop:
-    // replace VPGATHER
-    movzx      esi, byte ptr [eax + 3]                 // alpha0
-    movzx      edi, byte ptr [eax + 7]                 // alpha1
+        // replace VPGATHER
+    movzx      esi, byte ptr [eax + 3]  // alpha0
+    movzx      edi, byte ptr [eax + 7]  // alpha1
     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a0]
     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a1]
-    movzx      esi, byte ptr [eax + 11]                // alpha2
-    movzx      edi, byte ptr [eax + 15]                // alpha3
-    vpunpckldq xmm6, xmm0, xmm1                        // [1,a1,1,a0]
+    movzx      esi, byte ptr [eax + 11]  // alpha2
+    movzx      edi, byte ptr [eax + 15]  // alpha3
+    vpunpckldq xmm6, xmm0, xmm1  // [1,a1,1,a0]
     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a2]
     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a3]
-    movzx      esi, byte ptr [eax + 19]                // alpha4
-    movzx      edi, byte ptr [eax + 23]                // alpha5
-    vpunpckldq xmm7, xmm2, xmm3                        // [1,a3,1,a2]
+    movzx      esi, byte ptr [eax + 19]  // alpha4
+    movzx      edi, byte ptr [eax + 23]  // alpha5
+    vpunpckldq xmm7, xmm2, xmm3  // [1,a3,1,a2]
     vmovd      xmm0, dword ptr [ebx + esi * 4]  // [1,a4]
     vmovd      xmm1, dword ptr [ebx + edi * 4]  // [1,a5]
-    movzx      esi, byte ptr [eax + 27]                // alpha6
-    movzx      edi, byte ptr [eax + 31]                // alpha7
-    vpunpckldq xmm0, xmm0, xmm1                        // [1,a5,1,a4]
+    movzx      esi, byte ptr [eax + 27]  // alpha6
+    movzx      edi, byte ptr [eax + 31]  // alpha7
+    vpunpckldq xmm0, xmm0, xmm1  // [1,a5,1,a4]
     vmovd      xmm2, dword ptr [ebx + esi * 4]  // [1,a6]
     vmovd      xmm3, dword ptr [ebx + edi * 4]  // [1,a7]
-    vpunpckldq xmm2, xmm2, xmm3                        // [1,a7,1,a6]
-    vpunpcklqdq xmm3, xmm6, xmm7                       // [1,a3,1,a2,1,a1,1,a0]
-    vpunpcklqdq xmm0, xmm0, xmm2                       // [1,a7,1,a6,1,a5,1,a4]
-    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
+    vpunpckldq xmm2, xmm2, xmm3  // [1,a7,1,a6]
+    vpunpcklqdq xmm3, xmm6, xmm7  // [1,a3,1,a2,1,a1,1,a0]
+    vpunpcklqdq xmm0, xmm0, xmm2  // [1,a7,1,a6,1,a5,1,a4]
+    vinserti128 ymm3, ymm3, xmm0, 1                // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
     // end of VPGATHER
 
-    vmovdqu    ymm6, [eax]       // read 8 pixels.
+    vmovdqu    ymm6, [eax]  // read 8 pixels.
     vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
     vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
     vpunpcklwd ymm2, ymm3, ymm3  // low 4 inverted alphas. mutated. 1, 1, a, a
@@ -4540,7 +4487,7 @@
     vpshufb    ymm3, ymm3, ymm5  // replicate high 4 alphas
     vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
     vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
-    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
+    vpackuswb  ymm0, ymm0, ymm1             // unmutated.
     vmovdqu    [eax + edx], ymm0
     lea        eax, [eax + 32]
     sub        ecx, 8
@@ -4558,12 +4505,13 @@
 
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels.
-__declspec(naked)
-void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
+__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb,
+                                         uint8_t* dst_argb,
+                                         int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]  /* width */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* width */
     movdqa     xmm4, xmmword ptr kARGBToYJ
     movdqa     xmm5, xmmword ptr kAddYJ64
 
@@ -4575,7 +4523,7 @@
     phaddw     xmm0, xmm1
     paddw      xmm0, xmm5  // Add .5 for rounding.
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0   // 8 G bytes
+    packuswb   xmm0, xmm0  // 8 G bytes
     movdqu     xmm2, [eax]  // A
     movdqu     xmm3, [eax + 16]
     lea        eax, [eax + 32]
@@ -4582,13 +4530,13 @@
     psrld      xmm2, 24
     psrld      xmm3, 24
     packuswb   xmm2, xmm3
-    packuswb   xmm2, xmm2   // 8 A bytes
-    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
-    punpcklbw  xmm0, xmm0   // 8 GG words
-    punpcklbw  xmm3, xmm2   // 8 GA words
+    packuswb   xmm2, xmm2  // 8 A bytes
+    movdqa     xmm3, xmm0  // Weave into GG, GA, then GGGA
+    punpcklbw  xmm0, xmm0  // 8 GG words
+    punpcklbw  xmm3, xmm2  // 8 GA words
     movdqa     xmm1, xmm0
-    punpcklwd  xmm0, xmm3   // GGGA first 4
-    punpckhwd  xmm1, xmm3   // GGGA next 4
+    punpcklwd  xmm0, xmm3  // GGGA first 4
+    punpckhwd  xmm1, xmm3  // GGGA next 4
     movdqu     [edx], xmm0
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
@@ -4604,24 +4552,20 @@
 //    g = (r * 45 + g * 88 + b * 22) >> 7
 //    r = (r * 50 + g * 98 + b * 24) >> 7
 // Constant for ARGB color to sepia tone.
-static const vec8 kARGBToSepiaB = {
-  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
-};
+static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
+                                   17, 68, 35, 0, 17, 68, 35, 0};
 
-static const vec8 kARGBToSepiaG = {
-  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
-};
+static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
+                                   22, 88, 45, 0, 22, 88, 45, 0};
 
-static const vec8 kARGBToSepiaR = {
-  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
-};
+static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
+                                   24, 98, 50, 0, 24, 98, 50, 0};
 
 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
-__declspec(naked)
-void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
   __asm {
-    mov        eax, [esp + 4]   /* dst_argb */
-    mov        ecx, [esp + 8]   /* width */
+    mov        eax, [esp + 4] /* dst_argb */
+    mov        ecx, [esp + 8] /* width */
     movdqa     xmm2, xmmword ptr kARGBToSepiaB
     movdqa     xmm3, xmmword ptr kARGBToSepiaG
     movdqa     xmm4, xmmword ptr kARGBToSepiaR
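
The three constant vectors loaded above encode the weighted sums from
the comment block, with coefficients stored in B,G,R order to match
ARGB memory layout. Per pixel, in scalar C (a sketch):

    #include <stdint.h>
    static uint8_t SepiaClamp_sketch(int v) {
      return (uint8_t)(v > 255 ? 255 : v);
    }
    static void ARGBSepiaPixel_sketch(uint8_t* p) {  /* p = B,G,R,A */
      int b = p[0], g = p[1], r = p[2];
      p[0] = SepiaClamp_sketch((b * 17 + g * 68 + r * 35) >> 7);
      p[1] = SepiaClamp_sketch((b * 22 + g * 88 + r * 45) >> 7);
      p[2] = SepiaClamp_sketch((b * 24 + g * 98 + r * 50) >> 7);
    }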
@@ -4633,7 +4577,7 @@
     pmaddubsw  xmm6, xmm2
     phaddw     xmm0, xmm6
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0   // 8 B values
+    packuswb   xmm0, xmm0  // 8 B values
     movdqu     xmm5, [eax]  // G
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm3
@@ -4640,8 +4584,8 @@
     pmaddubsw  xmm1, xmm3
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 G values
-    punpcklbw  xmm0, xmm5   // 8 BG values
+    packuswb   xmm5, xmm5  // 8 G values
+    punpcklbw  xmm0, xmm5  // 8 BG values
     movdqu     xmm5, [eax]  // R
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm5, xmm4
@@ -4648,17 +4592,17 @@
     pmaddubsw  xmm1, xmm4
     phaddw     xmm5, xmm1
     psrlw      xmm5, 7
-    packuswb   xmm5, xmm5   // 8 R values
+    packuswb   xmm5, xmm5  // 8 R values
     movdqu     xmm6, [eax]  // A
     movdqu     xmm1, [eax + 16]
     psrld      xmm6, 24
     psrld      xmm1, 24
     packuswb   xmm6, xmm1
-    packuswb   xmm6, xmm6   // 8 A values
-    punpcklbw  xmm5, xmm6   // 8 RA values
-    movdqa     xmm1, xmm0   // Weave BG, RA together
-    punpcklwd  xmm0, xmm5   // BGRA first 4
-    punpckhwd  xmm1, xmm5   // BGRA next 4
+    packuswb   xmm6, xmm6  // 8 A values
+    punpcklbw  xmm5, xmm6  // 8 RA values
+    movdqa     xmm1, xmm0  // Weave BG, RA together
+    punpcklwd  xmm0, xmm5  // BGRA first 4
+    punpckhwd  xmm1, xmm5  // BGRA next 4
     movdqu     [eax], xmm0
     movdqu     [eax + 16], xmm1
     lea        eax, [eax + 32]
@@ -4674,19 +4618,20 @@
 // Same as Sepia except matrix is provided.
 // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
 // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
-__declspec(naked)
-void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                              const int8* matrix_argb, int width) {
+__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
+                                                uint8_t* dst_argb,
+                                                const int8_t* matrix_argb,
+                                                int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]  /* matrix_argb */
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* matrix_argb */
     movdqu     xmm5, [ecx]
     pshufd     xmm2, xmm5, 0x00
     pshufd     xmm3, xmm5, 0x55
     pshufd     xmm4, xmm5, 0xaa
     pshufd     xmm5, xmm5, 0xff
-    mov        ecx, [esp + 16]  /* width */
+    mov        ecx, [esp + 16] /* width */
 
  convertloop:
     movdqu     xmm0, [eax]  // B
@@ -4697,31 +4642,31 @@
     movdqu     xmm1, [eax + 16]
     pmaddubsw  xmm6, xmm3
     pmaddubsw  xmm1, xmm3
-    phaddsw    xmm0, xmm7   // B
-    phaddsw    xmm6, xmm1   // G
-    psraw      xmm0, 6      // B
-    psraw      xmm6, 6      // G
-    packuswb   xmm0, xmm0   // 8 B values
-    packuswb   xmm6, xmm6   // 8 G values
-    punpcklbw  xmm0, xmm6   // 8 BG values
+    phaddsw    xmm0, xmm7  // B
+    phaddsw    xmm6, xmm1  // G
+    psraw      xmm0, 6  // B
+    psraw      xmm6, 6  // G
+    packuswb   xmm0, xmm0  // 8 B values
+    packuswb   xmm6, xmm6  // 8 G values
+    punpcklbw  xmm0, xmm6  // 8 BG values
     movdqu     xmm1, [eax]  // R
     movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm7, xmm4
-    phaddsw    xmm1, xmm7   // R
+    phaddsw    xmm1, xmm7  // R
     movdqu     xmm6, [eax]  // A
     movdqu     xmm7, [eax + 16]
     pmaddubsw  xmm6, xmm5
     pmaddubsw  xmm7, xmm5
-    phaddsw    xmm6, xmm7   // A
-    psraw      xmm1, 6      // R
-    psraw      xmm6, 6      // A
-    packuswb   xmm1, xmm1   // 8 R values
-    packuswb   xmm6, xmm6   // 8 A values
-    punpcklbw  xmm1, xmm6   // 8 RA values
-    movdqa     xmm6, xmm0   // Weave BG, RA together
-    punpcklwd  xmm0, xmm1   // BGRA first 4
-    punpckhwd  xmm6, xmm1   // BGRA next 4
+    phaddsw    xmm6, xmm7  // A
+    psraw      xmm1, 6  // R
+    psraw      xmm6, 6  // A
+    packuswb   xmm1, xmm1  // 8 R values
+    packuswb   xmm6, xmm6  // 8 A values
+    punpcklbw  xmm1, xmm6  // 8 RA values
+    movdqa     xmm6, xmm0  // Weave BG, RA together
+    punpcklwd  xmm0, xmm1  // BGRA first 4
+    punpckhwd  xmm6, xmm1  // BGRA next 4
     movdqu     [edx], xmm0
     movdqu     [edx + 16], xmm6
     lea        eax, [eax + 32]
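
In scalar terms, the color-matrix row is four dot products per pixel. A minimal sketch (illustrative only; assumes the usual BGRA byte order and an arithmetic right shift, as psraw performs):

#include <stdint.h>

// Output channel i is the dot product of the four input bytes with four
// signed 6-bit fixed-point coefficients m[i*4..i*4+3], then >> 6,
// saturated to 0..255 as packuswb does.
static void ColorMatrixPixel_C(const uint8_t* src, uint8_t* dst,
                               const int8_t* m) {  // 16 coefficients
  for (int i = 0; i < 4; ++i) {
    int v = (src[0] * m[i * 4 + 0] + src[1] * m[i * 4 + 1] +
             src[2] * m[i * 4 + 2] + src[3] * m[i * 4 + 3]) >> 6;
    dst[i] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
  }
}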
@@ -4735,15 +4680,17 @@
 
 #ifdef HAS_ARGBQUANTIZEROW_SSE2
 // Quantize 4 ARGB pixels (16 bytes).
-__declspec(naked)
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
-                          int interval_offset, int width) {
+__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
+                                            int scale,
+                                            int interval_size,
+                                            int interval_offset,
+                                            int width) {
   __asm {
-    mov        eax, [esp + 4]    /* dst_argb */
-    movd       xmm2, [esp + 8]   /* scale */
-    movd       xmm3, [esp + 12]  /* interval_size */
-    movd       xmm4, [esp + 16]  /* interval_offset */
-    mov        ecx, [esp + 20]   /* width */
+    mov        eax, [esp + 4] /* dst_argb */
+    movd       xmm2, [esp + 8] /* scale */
+    movd       xmm3, [esp + 12] /* interval_size */
+    movd       xmm4, [esp + 16] /* interval_offset */
+    mov        ecx, [esp + 20] /* width */
     pshuflw    xmm2, xmm2, 040h
     pshufd     xmm2, xmm2, 044h
     pshuflw    xmm3, xmm3, 040h
@@ -4756,16 +4703,16 @@
 
  convertloop:
     movdqu     xmm0, [eax]  // read 4 pixels
-    punpcklbw  xmm0, xmm5   // first 2 pixels
-    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
+    punpcklbw  xmm0, xmm5  // first 2 pixels
+    pmulhuw    xmm0, xmm2  // pixel * scale >> 16
     movdqu     xmm1, [eax]  // read 4 pixels
-    punpckhbw  xmm1, xmm5   // next 2 pixels
+    punpckhbw  xmm1, xmm5  // next 2 pixels
     pmulhuw    xmm1, xmm2
-    pmullw     xmm0, xmm3   // * interval_size
+    pmullw     xmm0, xmm3  // * interval_size
     movdqu     xmm7, [eax]  // read 4 pixels
     pmullw     xmm1, xmm3
-    pand       xmm7, xmm6   // mask alpha
-    paddw      xmm0, xmm4   // + interval_size / 2
+    pand       xmm7, xmm6  // mask alpha
+    paddw      xmm0, xmm4  // + interval_size / 2
     paddw      xmm1, xmm4
     packuswb   xmm0, xmm1
     por        xmm0, xmm7
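
A scalar sketch of the quantizer's fixed-point steps (hypothetical helper, assuming scale is roughly 65536 / interval_size as the callers set it up):

#include <stdint.h>

// (v * scale) >> 16 picks the interval index, which is mapped back to a
// representative level; alpha is masked off and restored unchanged.
static void QuantizePixel_C(uint8_t* argb, int scale, int interval_size,
                            int interval_offset) {
  for (int i = 0; i < 3; ++i) {  // B, G, R only
    int idx = (argb[i] * scale) >> 16;
    int q = idx * interval_size + interval_offset;
    argb[i] = (uint8_t)(q > 255 ? 255 : q);  // packuswb saturates
  }
}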
@@ -4780,12 +4727,13 @@
 
 #ifdef HAS_ARGBSHADEROW_SSE2
 // Shade 4 pixels at a time by specified value.
-__declspec(naked)
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
+__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
+                                         uint8_t* dst_argb,
+                                         int width,
+                                         uint32_t value) {
   __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
     mov        ecx, [esp + 12]  // width
     movd       xmm2, [esp + 16]  // value
     punpcklbw  xmm2, xmm2
@@ -4792,13 +4740,13 @@
     punpcklqdq xmm2, xmm2
 
  convertloop:
-    movdqu     xmm0, [eax]      // read 4 pixels
+    movdqu     xmm0, [eax]  // read 4 pixels
     lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm0       // first 2
-    punpckhbw  xmm1, xmm1       // next 2
-    pmulhuw    xmm0, xmm2       // argb * value
-    pmulhuw    xmm1, xmm2       // argb * value
+    punpcklbw  xmm0, xmm0  // first 2
+    punpckhbw  xmm1, xmm1  // next 2
+    pmulhuw    xmm0, xmm2  // argb * value
+    pmulhuw    xmm1, xmm2  // argb * value
     psrlw      xmm0, 8
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
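
Why the byte duplication works: a scalar model of one shaded channel (illustrative only; the SIMD path does eight channels per multiply):

#include <stdint.h>

// punpcklbw xmm0, xmm0 widens v to v * 257; with the value byte widened
// the same way, ((v*257) * (s*257)) >> 16 >> 8 closely approximates
// v * s / 255, so an 0xff channel in `value` leaves the pixel unchanged.
static uint8_t ShadeChannel_C(uint8_t v, uint8_t s) {
  uint32_t v16 = v * 257u;
  uint32_t s16 = s * 257u;
  return (uint8_t)(((v16 * s16) >> 16) >> 8);
}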
@@ -4814,28 +4762,29 @@
 
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
-    movdqu     xmm2, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
+    movdqu     xmm2, [esi]  // read 4 pixels from src_argb1
     movdqu     xmm1, xmm0
     movdqu     xmm3, xmm2
-    punpcklbw  xmm0, xmm0         // first 2
-    punpckhbw  xmm1, xmm1         // next 2
-    punpcklbw  xmm2, xmm5         // first 2
-    punpckhbw  xmm3, xmm5         // next 2
-    pmulhuw    xmm0, xmm2         // src_argb0 * src_argb1 first 2
-    pmulhuw    xmm1, xmm3         // src_argb0 * src_argb1 next 2
+    punpcklbw  xmm0, xmm0  // first 2
+    punpckhbw  xmm1, xmm1  // next 2
+    punpcklbw  xmm2, xmm5  // first 2
+    punpckhbw  xmm3, xmm5  // next 2
+    pmulhuw    xmm0, xmm2  // src_argb0 * src_argb1 first 2
+    pmulhuw    xmm1, xmm3  // src_argb0 * src_argb1 next 2
     lea        eax, [eax + 16]
     lea        esi, [esi + 16]
     packuswb   xmm0, xmm1
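
A scalar model of the multiply approximation (illustrative only; results can land one below the exactly rounded a * b / 255, e.g. 255 * 255 gives 254):

#include <stdint.h>

// One operand is widened as a * 257 (punpcklbw with itself), the other
// zero-extended; pmulhuw then yields (a * 257 * b) >> 16 ~= a * b / 255.
static uint8_t MultiplyChannel_C(uint8_t a, uint8_t b) {
  return (uint8_t)((a * 257u * b) >> 16);
}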
@@ -4853,13 +4802,14 @@
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 // TODO(fbarchard): Port this to posix, neon and other math functions.
-__declspec(naked)
-void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0,
+                                       const uint8_t* src_argb1,
+                                       uint8_t* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
@@ -4867,11 +4817,11 @@
     jl         convertloop49
 
  convertloop4:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
     lea        eax, [eax + 16]
-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    paddusb    xmm0, xmm1  // src_argb0 + src_argb1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4882,11 +4832,11 @@
     jl         convertloop19
 
  convertloop1:
-    movd       xmm0, [eax]        // read 1 pixels from src_argb0
+    movd       xmm0, [eax]  // read 1 pixel from src_argb0
     lea        eax, [eax + 4]
-    movd       xmm1, [esi]        // read 1 pixels from src_argb1
+    movd       xmm1, [esi]  // read 1 pixel from src_argb1
     lea        esi, [esi + 4]
-    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
+    paddusb    xmm0, xmm1  // src_argb0 + src_argb1
     movd       [edx], xmm0
     lea        edx, [edx + 4]
     sub        ecx, 1
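
paddusb and psubusb have no direct C operator; a scalar model of the per-byte saturating arithmetic both row functions rely on (illustrative only):

#include <stdint.h>

// Unsigned saturating add/subtract, per byte, no carry across channels.
static uint8_t AddUs8_C(uint8_t a, uint8_t b) {
  int v = a + b;
  return (uint8_t)(v > 255 ? 255 : v);
}
static uint8_t SubUs8_C(uint8_t a, uint8_t b) {
  return (uint8_t)(a > b ? (a - b) : 0);
}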
@@ -4901,22 +4851,23 @@
 
 #ifdef HAS_ARGBSUBTRACTROW_SSE2
 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    movdqu     xmm0, [eax]        // read 4 pixels from src_argb0
+    movdqu     xmm0, [eax]  // read 4 pixels from src_argb0
     lea        eax, [eax + 16]
-    movdqu     xmm1, [esi]        // read 4 pixels from src_argb1
+    movdqu     xmm1, [esi]  // read 4 pixels from src_argb1
     lea        esi, [esi + 16]
-    psubusb    xmm0, xmm1         // src_argb0 - src_argb1
+    psubusb    xmm0, xmm1  // src_argb0 - src_argb1
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -4930,28 +4881,29 @@
 
 #ifdef HAS_ARGBMULTIPLYROW_AVX2
 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
-    vpxor      ymm5, ymm5, ymm5     // constant 0
+    vpxor      ymm5, ymm5, ymm5  // constant 0
 
  convertloop:
-    vmovdqu    ymm1, [eax]        // read 8 pixels from src_argb0
+    vmovdqu    ymm1, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vmovdqu    ymm3, [esi]        // read 8 pixels from src_argb1
+    vmovdqu    ymm3, [esi]  // read 8 pixels from src_argb1
     lea        esi, [esi + 32]
-    vpunpcklbw ymm0, ymm1, ymm1   // low 4
-    vpunpckhbw ymm1, ymm1, ymm1   // high 4
-    vpunpcklbw ymm2, ymm3, ymm5   // low 4
-    vpunpckhbw ymm3, ymm3, ymm5   // high 4
-    vpmulhuw   ymm0, ymm0, ymm2   // src_argb0 * src_argb1 low 4
-    vpmulhuw   ymm1, ymm1, ymm3   // src_argb0 * src_argb1 high 4
+    vpunpcklbw ymm0, ymm1, ymm1  // low 4
+    vpunpckhbw ymm1, ymm1, ymm1  // high 4
+    vpunpcklbw ymm2, ymm3, ymm5  // low 4
+    vpunpckhbw ymm3, ymm3, ymm5  // high 4
+    vpmulhuw   ymm0, ymm0, ymm2  // src_argb0 * src_argb1 low 4
+    vpmulhuw   ymm1, ymm1, ymm3  // src_argb0 * src_argb1 high 4
     vpackuswb  ymm0, ymm0, ymm1
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -4967,20 +4919,21 @@
 
 #ifdef HAS_ARGBADDROW_AVX2
 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0,
+                                       const uint8_t* src_argb1,
+                                       uint8_t* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vpaddusb   ymm0, ymm0, [esi]        // add 8 pixels from src_argb1
+    vpaddusb   ymm0, ymm0, [esi]  // add 8 pixels from src_argb1
     lea        esi, [esi + 32]
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -4996,20 +4949,21 @@
 
 #ifdef HAS_ARGBSUBTRACTROW_AVX2
 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time.
-__declspec(naked)
-void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
-                          uint8* dst_argb, int width) {
+__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0,
+                                            const uint8_t* src_argb1,
+                                            uint8_t* dst_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_argb0
-    mov        esi, [esp + 4 + 8]   // src_argb1
+    mov        eax, [esp + 4 + 4]  // src_argb0
+    mov        esi, [esp + 4 + 8]  // src_argb1
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
 
  convertloop:
-    vmovdqu    ymm0, [eax]              // read 8 pixels from src_argb0
+    vmovdqu    ymm0, [eax]  // read 8 pixels from src_argb0
     lea        eax, [eax + 32]
-    vpsubusb   ymm0, ymm0, [esi]        // src_argb0 - src_argb1
+    vpsubusb   ymm0, ymm0, [esi]  // src_argb0 - src_argb1
     lea        esi, [esi + 32]
     vmovdqu    [edx], ymm0
     lea        edx, [edx + 32]
@@ -5028,14 +4982,16 @@
 // -1  0  1
 // -2  0  2
 // -1  0  1
-__declspec(naked)
-void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    const uint8* src_y2, uint8* dst_sobelx, int width) {
+__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
+                                      const uint8_t* src_y1,
+                                      const uint8_t* src_y2,
+                                      uint8_t* dst_sobelx,
+                                      int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   // src_y0
-    mov        esi, [esp + 8 + 8]   // src_y1
+    mov        eax, [esp + 8 + 4]  // src_y0
+    mov        esi, [esp + 8 + 8]  // src_y1
     mov        edi, [esp + 8 + 12]  // src_y2
     mov        edx, [esp + 8 + 16]  // dst_sobelx
     mov        ecx, [esp + 8 + 20]  // width
@@ -5045,17 +5001,17 @@
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
-    movq       xmm1, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
     punpcklbw  xmm0, xmm5
     punpcklbw  xmm1, xmm5
     psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
     movq       xmm2, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
     punpcklbw  xmm1, xmm5
     punpcklbw  xmm2, xmm5
     psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + edi]      // read 8 pixels from src_y2[0]
+    movq       xmm2, qword ptr [eax + edi]  // read 8 pixels from src_y2[0]
     movq       xmm3, qword ptr [eax + edi + 2]  // read 8 pixels from src_y2[2]
     punpcklbw  xmm2, xmm5
     punpcklbw  xmm3, xmm5
@@ -5063,7 +5019,7 @@
     paddw      xmm0, xmm2
     paddw      xmm0, xmm1
     paddw      xmm0, xmm1
-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
     psubw      xmm1, xmm0
     pmaxsw     xmm0, xmm1
     packuswb   xmm0, xmm0
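
A scalar sketch of the tap pattern (hypothetical helper; SobelYRow below is the same idea with rows and columns swapped):

#include <stdint.h>

// Kernel {-1,0,1; -2,0,2; -1,0,1} over rows y0/y1/y2; the sign is
// dropped via abs (pmaxsw against the negation) and clamped to 255.
static uint8_t SobelX_C(const uint8_t* y0, const uint8_t* y1,
                        const uint8_t* y2, int x) {
  int s = (y0[x] - y0[x + 2]) + 2 * (y1[x] - y1[x + 2]) +
          (y2[x] - y2[x + 2]);
  if (s < 0) s = -s;
  return (uint8_t)(s > 255 ? 255 : s);
}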
@@ -5084,13 +5040,14 @@
 // -1 -2 -1
 //  0  0  0
 //  1  2  1
-__declspec(naked)
-void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
-                    uint8* dst_sobely, int width) {
+__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
+                                      const uint8_t* src_y1,
+                                      uint8_t* dst_sobely,
+                                      int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_y0
-    mov        esi, [esp + 4 + 8]   // src_y1
+    mov        eax, [esp + 4 + 4]  // src_y0
+    mov        esi, [esp + 4 + 8]  // src_y1
     mov        edx, [esp + 4 + 12]  // dst_sobely
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
@@ -5098,17 +5055,17 @@
     pxor       xmm5, xmm5  // constant 0
 
  convertloop:
-    movq       xmm0, qword ptr [eax]            // read 8 pixels from src_y0[0]
-    movq       xmm1, qword ptr [eax + esi]      // read 8 pixels from src_y1[0]
+    movq       xmm0, qword ptr [eax]  // read 8 pixels from src_y0[0]
+    movq       xmm1, qword ptr [eax + esi]  // read 8 pixels from src_y1[0]
     punpcklbw  xmm0, xmm5
     punpcklbw  xmm1, xmm5
     psubw      xmm0, xmm1
-    movq       xmm1, qword ptr [eax + 1]        // read 8 pixels from src_y0[1]
+    movq       xmm1, qword ptr [eax + 1]  // read 8 pixels from src_y0[1]
     movq       xmm2, qword ptr [eax + esi + 1]  // read 8 pixels from src_y1[1]
     punpcklbw  xmm1, xmm5
     punpcklbw  xmm2, xmm5
     psubw      xmm1, xmm2
-    movq       xmm2, qword ptr [eax + 2]        // read 8 pixels from src_y0[2]
+    movq       xmm2, qword ptr [eax + 2]  // read 8 pixels from src_y0[2]
     movq       xmm3, qword ptr [eax + esi + 2]  // read 8 pixels from src_y1[2]
     punpcklbw  xmm2, xmm5
     punpcklbw  xmm3, xmm5
@@ -5116,7 +5073,7 @@
     paddw      xmm0, xmm2
     paddw      xmm0, xmm1
     paddw      xmm0, xmm1
-    pxor       xmm1, xmm1   // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
+    pxor       xmm1, xmm1  // abs = max(xmm0, -xmm0).  SSSE3 could use pabsw
     psubw      xmm1, xmm0
     pmaxsw     xmm0, xmm1
     packuswb   xmm0, xmm0
@@ -5137,36 +5094,37 @@
 // R = Sobel
 // G = Sobel
 // B = Sobel
-__declspec(naked)
-void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                   uint8* dst_argb, int width) {
+__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
+                                     const uint8_t* src_sobely,
+                                     uint8_t* dst_argb,
+                                     int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
-    pcmpeqb    xmm5, xmm5           // alpha 255
-    pslld      xmm5, 24             // 0xff000000
+    pcmpeqb    xmm5, xmm5  // alpha 255
+    pslld      xmm5, 24  // 0xff000000
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
-    movdqa     xmm2, xmm0             // GG
-    punpcklbw  xmm2, xmm0             // First 8
-    punpckhbw  xmm0, xmm0             // Next 8
-    movdqa     xmm1, xmm2             // GGGG
-    punpcklwd  xmm1, xmm2             // First 4
-    punpckhwd  xmm2, xmm2             // Next 4
-    por        xmm1, xmm5             // GGGA
+    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
+    movdqa     xmm2, xmm0  // GG
+    punpcklbw  xmm2, xmm0  // First 8
+    punpckhbw  xmm0, xmm0  // Next 8
+    movdqa     xmm1, xmm2  // GGGG
+    punpcklwd  xmm1, xmm2  // First 4
+    punpckhwd  xmm2, xmm2  // Next 4
+    por        xmm1, xmm5  // GGGA
     por        xmm2, xmm5
-    movdqa     xmm3, xmm0             // GGGG
-    punpcklwd  xmm3, xmm0             // Next 4
-    punpckhwd  xmm0, xmm0             // Last 4
-    por        xmm3, xmm5             // GGGA
+    movdqa     xmm3, xmm0  // GGGG
+    punpcklwd  xmm3, xmm0  // Next 4
+    punpckhwd  xmm0, xmm0  // Last 4
+    por        xmm3, xmm5  // GGGA
     por        xmm0, xmm5
     movdqu     [edx], xmm1
     movdqu     [edx + 16], xmm2
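
The unpack sequence above expands each magnitude into a grey ARGB pixel; in scalar form (illustrative only):

#include <stdint.h>

// sobel = min(sobelx + sobely, 255), replicated into B, G and R with a
// constant 0xff alpha (the pslld xmm5, 24 mask).
static void SobelPixel_C(uint8_t sx, uint8_t sy, uint8_t* dst_argb) {
  int s = sx + sy;
  uint8_t g = (uint8_t)(s > 255 ? 255 : s);
  dst_argb[0] = g;    // B
  dst_argb[1] = g;    // G
  dst_argb[2] = g;    // R
  dst_argb[3] = 255;  // A
}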
@@ -5184,22 +5142,23 @@
 
 #ifdef HAS_SOBELTOPLANEROW_SSE2
 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
-__declspec(naked)
-void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                          uint8* dst_y, int width) {
+__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
+                                            const uint8_t* src_sobely,
+                                            uint8_t* dst_y,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_y
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
-    paddusb    xmm0, xmm1             // sobel = sobelx + sobely
+    paddusb    xmm0, xmm1  // sobel = sobelx + sobely
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
     sub        ecx, 16
@@ -5217,36 +5176,37 @@
 // R = Sobel X
 // G = Sobel
 // B = Sobel Y
-__declspec(naked)
-void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
-                     uint8* dst_argb, int width) {
+__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
+                                       const uint8_t* src_sobely,
+                                       uint8_t* dst_argb,
+                                       int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   // src_sobelx
-    mov        esi, [esp + 4 + 8]   // src_sobely
+    mov        eax, [esp + 4 + 4]  // src_sobelx
+    mov        esi, [esp + 4 + 8]  // src_sobely
     mov        edx, [esp + 4 + 12]  // dst_argb
     mov        ecx, [esp + 4 + 16]  // width
     sub        esi, eax
-    pcmpeqb    xmm5, xmm5           // alpha 255
+    pcmpeqb    xmm5, xmm5  // alpha 255
 
  convertloop:
-    movdqu     xmm0, [eax]            // read 16 pixels src_sobelx
-    movdqu     xmm1, [eax + esi]      // read 16 pixels src_sobely
+    movdqu     xmm0, [eax]  // read 16 pixels src_sobelx
+    movdqu     xmm1, [eax + esi]  // read 16 pixels src_sobely
     lea        eax, [eax + 16]
     movdqa     xmm2, xmm0
-    paddusb    xmm2, xmm1             // sobel = sobelx + sobely
-    movdqa     xmm3, xmm0             // XA
+    paddusb    xmm2, xmm1  // sobel = sobelx + sobely
+    movdqa     xmm3, xmm0  // XA
     punpcklbw  xmm3, xmm5
     punpckhbw  xmm0, xmm5
-    movdqa     xmm4, xmm1             // YS
+    movdqa     xmm4, xmm1  // YS
     punpcklbw  xmm4, xmm2
     punpckhbw  xmm1, xmm2
-    movdqa     xmm6, xmm4             // YSXA
-    punpcklwd  xmm6, xmm3             // First 4
-    punpckhwd  xmm4, xmm3             // Next 4
-    movdqa     xmm7, xmm1             // YSXA
-    punpcklwd  xmm7, xmm0             // Next 4
-    punpckhwd  xmm1, xmm0             // Last 4
+    movdqa     xmm6, xmm4  // YSXA
+    punpcklwd  xmm6, xmm3  // First 4
+    punpckhwd  xmm4, xmm3  // Next 4
+    movdqa     xmm7, xmm1  // YSXA
+    punpcklwd  xmm7, xmm0  // Next 4
+    punpckhwd  xmm1, xmm0  // Last 4
     movdqu     [edx], xmm6
     movdqu     [edx + 16], xmm4
     movdqu     [edx + 32], xmm7
@@ -5275,8 +5235,11 @@
 // count is number of averaged pixels to produce.
 // Does 4 pixels at a time.
 // This function requires alignment on accumulation buffer pointers.
-void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
-                                    int width, int area, uint8* dst,
+void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
+                                    const int32_t* botleft,
+                                    int width,
+                                    int area,
+                                    uint8_t* dst,
                                     int count) {
   __asm {
     mov        eax, topleft  // eax topleft
@@ -5294,18 +5257,18 @@
     cmp        area, 128  // 128 pixels will not overflow 15 bits.
     ja         l4
 
-    pshufd     xmm5, xmm5, 0        // area
-    pcmpeqb    xmm6, xmm6           // constant of 65536.0 - 1 = 65535.0
+    pshufd     xmm5, xmm5, 0  // area
+    pcmpeqb    xmm6, xmm6  // constant of 65536.0 - 1 = 65535.0
     psrld      xmm6, 16
     cvtdq2ps   xmm6, xmm6
-    addps      xmm5, xmm6           // (65536.0 + area - 1)
-    mulps      xmm5, xmm4           // (65536.0 + area - 1) * 1 / area
-    cvtps2dq   xmm5, xmm5           // 0.16 fixed point
-    packssdw   xmm5, xmm5           // 16 bit shorts
+    addps      xmm5, xmm6  // (65536.0 + area - 1)
+    mulps      xmm5, xmm4  // (65536.0 + area - 1) * 1 / area
+    cvtps2dq   xmm5, xmm5  // 0.16 fixed point
+    packssdw   xmm5, xmm5  // 16 bit shorts
 
-    // 4 pixel loop small blocks.
+        // 4 pixel loop small blocks.
   s4:
-    // top left
+        // top left
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
@@ -5345,9 +5308,9 @@
 
     jmp        l4b
 
-    // 4 pixel loop
+            // 4 pixel loop
   l4:
-    // top left
+        // top left
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + 32]
@@ -5373,7 +5336,7 @@
     paddd      xmm3, [esi + edx * 4 + 48]
     lea        esi, [esi + 64]
 
-    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
+    cvtdq2ps   xmm0, xmm0  // Average = Sum * 1 / Area
     cvtdq2ps   xmm1, xmm1
     mulps      xmm0, xmm4
     mulps      xmm1, xmm4
@@ -5397,7 +5360,7 @@
     add        ecx, 4 - 1
     jl         l1b
 
-    // 1 pixel loop
+        // 1 pixel loop
   l1:
     movdqu     xmm0, [eax]
     psubd      xmm0, [eax + edx * 4]
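
The four loads above are the classic integral-image lookup. A scalar sketch of one averaged ARGB pixel (hypothetical helper using the plain float reciprocal; the small-area path does the same in 0.16 fixed point):

#include <stdint.h>

// The sum over a w-pixel-wide box is topleft - topright - botleft +
// botright, one int32 per channel (4 per pixel); average = sum / area.
static void BoxAverage_C(const int32_t* topleft, const int32_t* botleft,
                         int w, float area_inv, uint8_t* dst) {
  for (int i = 0; i < 4; ++i) {
    int32_t sum = topleft[i] - topleft[w * 4 + i] - botleft[i] +
                  botleft[w * 4 + i];
    dst[i] = (uint8_t)(sum * area_inv);
  }
}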
@@ -5422,8 +5385,10 @@
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value.
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
-                                  const int32* previous_cumsum, int width) {
+void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
+                                  int32_t* cumsum,
+                                  const int32_t* previous_cumsum,
+                                  int width) {
   __asm {
     mov        eax, row
     mov        edx, cumsum
@@ -5437,7 +5402,7 @@
     test       edx, 15
     jne        l4b
 
-    // 4 pixel loop
+        // 4 pixel loop
   l4:
     movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
     lea        eax, [eax + 16]
@@ -5483,7 +5448,7 @@
     add        ecx, 4 - 1
     jl         l1b
 
-    // 1 pixel loop
+        // 1 pixel loop
   l1:
     movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
     lea        eax, [eax + 4]
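
For reference, the scalar recurrence this vector code implements: a running sum across the row added to the cumulative row above (illustrative sketch, essentially the C fallback):

#include <stdint.h>

static void CumulativeSumRow_C_sketch(const uint8_t* row, int32_t* cumsum,
                                      const int32_t* previous_cumsum,
                                      int width) {
  int32_t sum[4] = {0, 0, 0, 0};  // running B, G, R, A totals
  for (int x = 0; x < width * 4; x += 4) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x + c];
      cumsum[x + c] = sum[c] + previous_cumsum[x + c];
    }
  }
}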
@@ -5505,10 +5470,11 @@
 
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
-__declspec(naked)
-LIBYUV_API
-void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
-                        uint8* dst_argb, const float* uv_dudv, int width) {
+__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
+                                                     int src_argb_stride,
+                                                     uint8_t* dst_argb,
+                                                     const float* uv_dudv,
+                                                     int width) {
   __asm {
     push       esi
     push       edi
@@ -5519,30 +5485,30 @@
     movq       xmm2, qword ptr [ecx]  // uv
     movq       xmm7, qword ptr [ecx + 8]  // dudv
     mov        ecx, [esp + 28]  // width
-    shl        esi, 16          // 4, stride
+    shl        esi, 16  // 4, stride
     add        esi, 4
     movd       xmm5, esi
     sub        ecx, 4
     jl         l4b
 
-    // setup for 4 pixel loop
+        // setup for 4 pixel loop
     pshufd     xmm7, xmm7, 0x44  // dup dudv
     pshufd     xmm5, xmm5, 0  // dup 4, stride
-    movdqa     xmm0, xmm2    // x0, y0, x1, y1
+    movdqa     xmm0, xmm2  // x0, y0, x1, y1
     addps      xmm0, xmm7
     movlhps    xmm2, xmm0
     movdqa     xmm4, xmm7
-    addps      xmm4, xmm4    // dudv *= 2
-    movdqa     xmm3, xmm2    // x2, y2, x3, y3
+    addps      xmm4, xmm4  // dudv *= 2
+    movdqa     xmm3, xmm2  // x2, y2, x3, y3
     addps      xmm3, xmm4
-    addps      xmm4, xmm4    // dudv *= 4
+    addps      xmm4, xmm4  // dudv *= 4
 
-    // 4 pixel loop
+        // 4 pixel loop
   l4:
-    cvttps2dq  xmm0, xmm2    // x, y float to int first 2
-    cvttps2dq  xmm1, xmm3    // x, y float to int next 2
-    packssdw   xmm0, xmm1    // x, y as 8 shorts
-    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
+    cvttps2dq  xmm0, xmm2  // x, y float to int first 2
+    cvttps2dq  xmm1, xmm3  // x, y float to int next 2
+    packssdw   xmm0, xmm1  // x, y as 8 shorts
+    pmaddwd    xmm0, xmm5  // offsets = x * 4 + y * stride.
     movd       esi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
     movd       edi, xmm0
@@ -5549,8 +5515,8 @@
     pshufd     xmm0, xmm0, 0x39  // shift right
     movd       xmm1, [eax + esi]  // read pixel 0
     movd       xmm6, [eax + edi]  // read pixel 1
-    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
-    addps      xmm2, xmm4    // x, y += dx, dy first 2
+    punpckldq  xmm1, xmm6  // combine pixel 0 and 1
+    addps      xmm2, xmm4  // x, y += dx, dy first 2
     movq       qword ptr [edx], xmm1
     movd       esi, xmm0
     pshufd     xmm0, xmm0, 0x39  // shift right
@@ -5557,8 +5523,8 @@
     movd       edi, xmm0
     movd       xmm6, [eax + esi]  // read pixel 2
     movd       xmm0, [eax + edi]  // read pixel 3
-    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
-    addps      xmm3, xmm4    // x, y += dx, dy next 2
+    punpckldq  xmm6, xmm0  // combine pixel 2 and 3
+    addps      xmm3, xmm4  // x, y += dx, dy next 2
     movq       qword ptr 8[edx], xmm6
     lea        edx, [edx + 16]
     sub        ecx, 4
@@ -5568,12 +5534,12 @@
     add        ecx, 4 - 1
     jl         l1b
 
-    // 1 pixel loop
+        // 1 pixel loop
   l1:
-    cvttps2dq  xmm0, xmm2    // x, y float to int
-    packssdw   xmm0, xmm0    // x, y as shorts
-    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
-    addps      xmm2, xmm7    // x, y += dx, dy
+    cvttps2dq  xmm0, xmm2  // x, y float to int
+    packssdw   xmm0, xmm0  // x, y as shorts
+    pmaddwd    xmm0, xmm5  // offset = x * 4 + y * stride
+    addps      xmm2, xmm7  // x, y += dx, dy
     movd       esi, xmm0
     movd       xmm0, [eax + esi]  // copy a pixel
     movd       [edx], xmm0
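
A scalar sketch of the affine walk (illustrative only; cvttps2dq truncates toward zero, matching the (int) casts):

#include <stdint.h>

// uv_dudv = {u, v, du, dv}; each output pixel is fetched from
// src + (int)v * stride + (int)u * 4, the pmaddwd {4, stride} dot product.
static void AffineRow_C_sketch(const uint8_t* src_argb, int src_stride,
                               uint8_t* dst_argb, const float* uv_dudv,
                               int width) {
  float u = uv_dudv[0], v = uv_dudv[1];
  for (int i = 0; i < width; ++i) {
    int offset = (int)v * src_stride + (int)u * 4;
    for (int c = 0; c < 4; ++c) {
      dst_argb[i * 4 + c] = src_argb[offset + c];
    }
    u += uv_dudv[2];
    v += uv_dudv[3];
  }
}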
@@ -5590,15 +5556,16 @@
 
 #ifdef HAS_INTERPOLATEROW_AVX2
 // Bilinear filter 32x2 -> 32x1
-__declspec(naked)
-void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                         ptrdiff_t src_stride, int dst_width,
-                         int source_y_fraction) {
+__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
+                                           const uint8_t* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           int dst_width,
+                                           int source_y_fraction) {
   __asm {
     push       esi
     push       edi
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edi, [esp + 8 + 4]  // dst_ptr
+    mov        esi, [esp + 8 + 8]  // src_ptr
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
@@ -5607,7 +5574,7 @@
     je         xloop100  // 0 / 256.  Blend 100 / 0.
     sub        edi, esi
     cmp        eax, 128
-    je         xloop50   // 128 /256 is 0.50.  Blend 50 / 50.
+    je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.
 
     vmovd      xmm0, eax  // high fraction 0..255
     neg        eax
@@ -5634,7 +5601,7 @@
     vpaddw     ymm0, ymm0, ymm4
     vpsrlw     ymm1, ymm1, 8
     vpsrlw     ymm0, ymm0, 8
-    vpackuswb  ymm0, ymm0, ymm1  // unmutates
+    vpackuswb  ymm0, ymm0, ymm1            // unmutates
     vmovdqu    [esi + edi], ymm0
     lea        esi, [esi + 32]
     sub        ecx, 32
@@ -5641,7 +5608,7 @@
     jg         xloop
     jmp        xloop99
 
-   // Blend 50 / 50.
+        // Blend 50 / 50.
  xloop50:
    vmovdqu    ymm0, [esi]
    vpavgb     ymm0, ymm0, [esi + edx]
@@ -5651,7 +5618,7 @@
    jg         xloop50
    jmp        xloop99
 
-   // Blend 100 / 0 - Copy row unchanged.
+        // Blend 100 / 0 - Copy row unchanged.
  xloop100:
    rep movsb
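
The blend itself, in scalar form (a sketch; it omits the -128 bias and rounding word the SIMD paths add for precision):

#include <stddef.h>
#include <stdint.h>

// dst = (src * (256 - f) + src1 * f) >> 8, with f == 0 dispatched to a
// copy and f == 128 to a plain average, as the labels above do.
static void InterpolateRow_C_sketch(uint8_t* dst, const uint8_t* src,
                                    ptrdiff_t stride, int width, int f) {
  const uint8_t* src1 = src + stride;
  for (int x = 0; x < width; ++x) {
    if (f == 0) {
      dst[x] = src[x];
    } else if (f == 128) {
      dst[x] = (uint8_t)((src[x] + src1[x] + 1) >> 1);  // like vpavgb
    } else {
      dst[x] = (uint8_t)((src[x] * (256 - f) + src1[x] * f) >> 8);
    }
  }
}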
 
@@ -5666,25 +5633,26 @@
 
 // Bilinear filter 16x2 -> 16x1
 // TODO(fbarchard): Consider allowing 256 using memcpy.
-__declspec(naked)
-void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
+__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
+                                            const uint8_t* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            int dst_width,
+                                            int source_y_fraction) {
   __asm {
     push       esi
     push       edi
 
-    mov        edi, [esp + 8 + 4]   // dst_ptr
-    mov        esi, [esp + 8 + 8]   // src_ptr
+    mov        edi, [esp + 8 + 4]  // dst_ptr
+    mov        esi, [esp + 8 + 8]  // src_ptr
     mov        edx, [esp + 8 + 12]  // src_stride
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
     sub        edi, esi
-    // Dispatch to specialized filters if applicable.
+        // Dispatch to specialized filters if applicable.
     cmp        eax, 0
     je         xloop100  // 0 / 256.  Blend 100 / 0.
     cmp        eax, 128
-    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
+    je         xloop50  // 128 / 256 is 0.50.  Blend 50 / 50.
 
     movd       xmm0, eax  // high fraction 0..255
     neg        eax
@@ -5703,7 +5671,7 @@
     movdqu     xmm1, xmm0
     punpcklbw  xmm0, xmm2
     punpckhbw  xmm1, xmm2
-    psubb      xmm0, xmm4  // bias image by -128
+    psubb      xmm0, xmm4            // bias image by -128
     psubb      xmm1, xmm4
     movdqa     xmm2, xmm5
     movdqa     xmm3, xmm5
@@ -5720,7 +5688,7 @@
     jg         xloop
     jmp        xloop99
 
-    // Blend 50 / 50.
+        // Blend 50 / 50.
   xloop50:
     movdqu     xmm0, [esi]
     movdqu     xmm1, [esi + edx]
@@ -5731,7 +5699,7 @@
     jg         xloop50
     jmp        xloop99
 
-    // Blend 100 / 0 - Copy row unchanged.
+        // Blend 100 / 0 - Copy row unchanged.
   xloop100:
     movdqu     xmm0, [esi]
     movdqu     [esi + edi], xmm0
@@ -5747,15 +5715,16 @@
 }
 
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
-__declspec(naked)
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                          const uint8* shuffler, int width) {
+__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
+                                            uint8_t* dst_argb,
+                                            const uint8_t* shuffler,
+                                            int width) {
   __asm {
-    mov        eax, [esp + 4]    // src_argb
-    mov        edx, [esp + 8]    // dst_argb
-    mov        ecx, [esp + 12]   // shuffler
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // shuffler
     movdqu     xmm5, [ecx]
-    mov        ecx, [esp + 16]   // width
+    mov        ecx, [esp + 16]  // width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -5773,15 +5742,16 @@
 }
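
In scalar terms the shuffle is an indexed byte copy per pixel (illustrative sketch, assuming per-pixel indices 0..3 in the first four shuffler bytes; pshufb applies the pattern to all 16 bytes at once):

#include <stdint.h>

static void ShufflePixel_C(const uint8_t* src, uint8_t* dst,
                           const uint8_t* shuffler) {
  for (int i = 0; i < 4; ++i) {
    dst[i] = src[shuffler[i] & 3];  // e.g. {3,2,1,0} reverses the bytes
  }
}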
 
 #ifdef HAS_ARGBSHUFFLEROW_AVX2
-__declspec(naked)
-void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
+__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
+                                           uint8_t* dst_argb,
+                                           const uint8_t* shuffler,
+                                           int width) {
   __asm {
-    mov        eax, [esp + 4]     // src_argb
-    mov        edx, [esp + 8]     // dst_argb
-    mov        ecx, [esp + 12]    // shuffler
-    vbroadcastf128 ymm5, [ecx]    // same shuffle in high as low.
-    mov        ecx, [esp + 16]    // width
+    mov        eax, [esp + 4]  // src_argb
+    mov        edx, [esp + 8]  // dst_argb
+    mov        ecx, [esp + 12]  // shuffler
+    vbroadcastf128 ymm5, [ecx]  // same shuffle in high as low.
+    mov        ecx, [esp + 16]  // width
 
   wloop:
     vmovdqu    ymm0, [eax]
@@ -5801,122 +5771,6 @@
 }
 #endif  // HAS_ARGBSHUFFLEROW_AVX2
 
-__declspec(naked)
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
-                         const uint8* shuffler, int width) {
-  __asm {
-    push       ebx
-    push       esi
-    mov        eax, [esp + 8 + 4]    // src_argb
-    mov        edx, [esp + 8 + 8]    // dst_argb
-    mov        esi, [esp + 8 + 12]   // shuffler
-    mov        ecx, [esp + 8 + 16]   // width
-    pxor       xmm5, xmm5
-
-    mov        ebx, [esi]   // shuffler
-    cmp        ebx, 0x03000102
-    je         shuf_3012
-    cmp        ebx, 0x00010203
-    je         shuf_0123
-    cmp        ebx, 0x00030201
-    je         shuf_0321
-    cmp        ebx, 0x02010003
-    je         shuf_2103
-
-  // TODO(fbarchard): Use one source pointer and 3 offsets.
-  shuf_any1:
-    movzx      ebx, byte ptr [esi]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx], bl
-    movzx      ebx, byte ptr [esi + 1]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 1], bl
-    movzx      ebx, byte ptr [esi + 2]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 2], bl
-    movzx      ebx, byte ptr [esi + 3]
-    movzx      ebx, byte ptr [eax + ebx]
-    mov        [edx + 3], bl
-    lea        eax, [eax + 4]
-    lea        edx, [edx + 4]
-    sub        ecx, 1
-    jg         shuf_any1
-    jmp        shuf99
-
-  shuf_0123:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 01Bh   // 1B = 00011011 = 0x0123 = BGRAToARGB
-    pshuflw    xmm0, xmm0, 01Bh
-    pshufhw    xmm1, xmm1, 01Bh
-    pshuflw    xmm1, xmm1, 01Bh
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_0123
-    jmp        shuf99
-
-  shuf_0321:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 039h   // 39 = 00111001 = 0x0321 = RGBAToARGB
-    pshuflw    xmm0, xmm0, 039h
-    pshufhw    xmm1, xmm1, 039h
-    pshuflw    xmm1, xmm1, 039h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_0321
-    jmp        shuf99
-
-  shuf_2103:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 093h   // 93 = 10010011 = 0x2103 = ARGBToRGBA
-    pshuflw    xmm0, xmm0, 093h
-    pshufhw    xmm1, xmm1, 093h
-    pshuflw    xmm1, xmm1, 093h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_2103
-    jmp        shuf99
-
-  shuf_3012:
-    movdqu     xmm0, [eax]
-    lea        eax, [eax + 16]
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm5
-    punpckhbw  xmm1, xmm5
-    pshufhw    xmm0, xmm0, 0C6h   // C6 = 11000110 = 0x3012 = ABGRToARGB
-    pshuflw    xmm0, xmm0, 0C6h
-    pshufhw    xmm1, xmm1, 0C6h
-    pshuflw    xmm1, xmm1, 0C6h
-    packuswb   xmm0, xmm1
-    movdqu     [edx], xmm0
-    lea        edx, [edx + 16]
-    sub        ecx, 4
-    jg         shuf_3012
-
-  shuf99:
-    pop        esi
-    pop        ebx
-    ret
-  }
-}
-
 // YUY2 - Macro-pixel = 2 image pixels
 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
 
@@ -5923,30 +5777,30 @@
 // UYVY - Macro-pixel = 2 image pixels
 // U0Y0V0Y1
 
-__declspec(naked)
-void I422ToYUY2Row_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
+__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
+                                          const uint8_t* src_u,
+                                          const uint8_t* src_v,
+                                          uint8_t* dst_frame,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
+    mov        eax, [esp + 8 + 4]  // src_y
+    mov        esi, [esp + 8 + 8]  // src_u
+    mov        edx, [esp + 8 + 12]  // src_v
+    mov        edi, [esp + 8 + 16]  // dst_frame
+    mov        ecx, [esp + 8 + 20]  // width
     sub        edx, esi
 
   convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
+    movq       xmm2, qword ptr [esi]  // U
+    movq       xmm3, qword ptr [esi + edx]  // V
     lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqu     xmm0, [eax] // Y
+    punpcklbw  xmm2, xmm3  // UV
+    movdqu     xmm0, [eax]  // Y
     lea        eax, [eax + 16]
     movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm2 // YUYV
+    punpcklbw  xmm0, xmm2  // YUYV
     punpckhbw  xmm1, xmm2
     movdqu     [edi], xmm0
     movdqu     [edi + 16], xmm1
@@ -5960,30 +5814,30 @@
   }
 }
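
In scalar form the macro-pixel packing is (illustrative sketch, even widths only):

#include <stdint.h>

// YUY2 stores Y0 U0 Y1 V0 per pair of image pixels; UYVY below is the
// same data rotated to U0 Y0 V0 Y1.
static void I422ToYUY2_C_sketch(const uint8_t* y, const uint8_t* u,
                                const uint8_t* v, uint8_t* dst, int width) {
  for (int x = 0; x < width; x += 2) {
    dst[0] = y[0];
    dst[1] = u[0];
    dst[2] = y[1];
    dst[3] = v[0];
    y += 2;
    u += 1;
    v += 1;
    dst += 4;
  }
}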
 
-__declspec(naked)
-void I422ToUYVYRow_SSE2(const uint8* src_y,
-                        const uint8* src_u,
-                        const uint8* src_v,
-                        uint8* dst_frame, int width) {
+__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
+                                          const uint8_t* src_u,
+                                          const uint8_t* src_v,
+                                          uint8_t* dst_frame,
+                                          int width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_y
-    mov        esi, [esp + 8 + 8]    // src_u
-    mov        edx, [esp + 8 + 12]   // src_v
-    mov        edi, [esp + 8 + 16]   // dst_frame
-    mov        ecx, [esp + 8 + 20]   // width
+    mov        eax, [esp + 8 + 4]  // src_y
+    mov        esi, [esp + 8 + 8]  // src_u
+    mov        edx, [esp + 8 + 12]  // src_v
+    mov        edi, [esp + 8 + 16]  // dst_frame
+    mov        ecx, [esp + 8 + 20]  // width
     sub        edx, esi
 
   convertloop:
-    movq       xmm2, qword ptr [esi] // U
-    movq       xmm3, qword ptr [esi + edx] // V
+    movq       xmm2, qword ptr [esi]  // U
+    movq       xmm3, qword ptr [esi + edx]  // V
     lea        esi, [esi + 8]
-    punpcklbw  xmm2, xmm3 // UV
-    movdqu     xmm0, [eax] // Y
+    punpcklbw  xmm2, xmm3  // UV
+    movdqu     xmm0, [eax]  // Y
     movdqa     xmm1, xmm2
     lea        eax, [eax + 16]
-    punpcklbw  xmm1, xmm0 // UYVY
+    punpcklbw  xmm1, xmm0  // UYVY
     punpckhbw  xmm2, xmm0
     movdqu     [edi], xmm1
     movdqu     [edi + 16], xmm2
@@ -5998,22 +5852,22 @@
 }
 
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
-__declspec(naked)
-void ARGBPolynomialRow_SSE2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
+__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
+                                              uint8_t* dst_argb,
+                                              const float* poly,
+                                              int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* src_argb */
-    mov        edx, [esp + 4 + 8]   /* dst_argb */
-    mov        esi, [esp + 4 + 12]  /* poly */
-    mov        ecx, [esp + 4 + 16]  /* width */
+    mov        eax, [esp + 4 + 4] /* src_argb */
+    mov        edx, [esp + 4 + 8] /* dst_argb */
+    mov        esi, [esp + 4 + 12] /* poly */
+    mov        ecx, [esp + 4 + 16] /* width */
     pxor       xmm3, xmm3  // 0 constant for zero extending bytes to ints.
 
-    // 2 pixel loop.
+        // 2 pixel loop.
  convertloop:
-//    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
-//    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
+        //    pmovzxbd  xmm0, dword ptr [eax]  // BGRA pixel
+        //    pmovzxbd  xmm4, dword ptr [eax + 4]  // BGRA pixel
     movq       xmm0, qword ptr [eax]  // BGRABGRA
     lea        eax, [eax + 8]
     punpcklbw  xmm0, xmm3
@@ -6057,25 +5911,25 @@
 #endif  // HAS_ARGBPOLYNOMIALROW_SSE2
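
Both the SSE2 version above and the AVX2 version below evaluate the same cubic per channel; a scalar sketch (illustrative only; poly holds four 4-float coefficient vectors C0..C3, one float per channel):

#include <stdint.h>

static uint8_t PolyChannel_C(uint8_t x, const float* poly, int c) {
  float f = (float)x;  // channel value as float, 0..255
  float r = poly[c] + poly[4 + c] * f + poly[8 + c] * f * f +
            poly[12 + c] * f * f * f;
  if (r < 0.f) r = 0.f;  // the cvttps2dq/packuswb path clamps similarly
  if (r > 255.f) r = 255.f;
  return (uint8_t)r;
}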
 
 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
-__declspec(naked)
-void ARGBPolynomialRow_AVX2(const uint8* src_argb,
-                            uint8* dst_argb, const float* poly,
-                            int width) {
+__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
+                                              uint8_t* dst_argb,
+                                              const float* poly,
+                                              int width) {
   __asm {
-    mov        eax, [esp + 4]   /* src_argb */
-    mov        edx, [esp + 8]   /* dst_argb */
-    mov        ecx, [esp + 12]   /* poly */
-    vbroadcastf128 ymm4, [ecx]       // C0
+    mov        eax, [esp + 4] /* src_argb */
+    mov        edx, [esp + 8] /* dst_argb */
+    mov        ecx, [esp + 12] /* poly */
+    vbroadcastf128 ymm4, [ecx]  // C0
     vbroadcastf128 ymm5, [ecx + 16]  // C1
     vbroadcastf128 ymm6, [ecx + 32]  // C2
     vbroadcastf128 ymm7, [ecx + 48]  // C3
-    mov        ecx, [esp + 16]  /* width */
+    mov        ecx, [esp + 16] /* width */
 
     // 2 pixel loop.
  convertloop:
     vpmovzxbd   ymm0, qword ptr [eax]  // 2 BGRA pixels
     lea         eax, [eax + 8]
-    vcvtdq2ps   ymm0, ymm0        // X 8 floats
+    vcvtdq2ps   ymm0, ymm0  // X 8 floats
     vmulps      ymm2, ymm0, ymm0  // X * X
     vmulps      ymm3, ymm0, ymm7  // C3 * X
     vfmadd132ps ymm0, ymm4, ymm5  // result = C0 + C1 * X
@@ -6095,16 +5949,125 @@
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+#ifdef HAS_HALFFLOATROW_SSE2
+static float kExpBias = 1.9259299444e-34f;
+__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
+                                         uint16_t* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    movd       xmm4, dword ptr [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+    mulss      xmm4, kExpBias
+    pshufd     xmm4, xmm4, 0
+    pxor       xmm5, xmm5
+    sub        edx, eax
+
+        // 8 pixel loop.
+ convertloop:
+    movdqu      xmm2, xmmword ptr [eax]  // 8 shorts
+    add         eax, 16
+    movdqa      xmm3, xmm2
+    punpcklwd   xmm2, xmm5
+    cvtdq2ps    xmm2, xmm2  // convert 8 ints to floats
+    punpckhwd   xmm3, xmm5
+    cvtdq2ps    xmm3, xmm3
+    mulps       xmm2, xmm4
+    mulps       xmm3, xmm4
+    psrld       xmm2, 13
+    psrld       xmm3, 13
+    packssdw    xmm2, xmm3
+    movdqu      [eax + edx - 16], xmm2
+    sub         ecx, 8
+    jg          convertloop
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_SSE2
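
Why kExpBias works: 1.9259299444e-34f is 2^-112, and 112 is the difference between the IEEE single exponent bias (127) and the half-float bias (15). Multiplying by it rebases the exponent, so the half bit pattern is simply the single's bits shifted right by 13 (23-bit mantissa down to 10 bits). A scalar sketch, valid for the non-negative in-range inputs the row functions assume (illustrative only):

#include <stdint.h>
#include <string.h>

static uint16_t ToHalf_C_sketch(uint16_t v, float scale) {
  float f = (float)v * scale * 1.9259299444e-34f;  // scale * 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // type-pun without aliasing issues
  return (uint16_t)(bits >> 13);    // truncate mantissa to 10 bits
}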
+
+#ifdef HAS_HALFFLOATROW_AVX2
+__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
+                                         uint16_t* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    movd       xmm4, dword ptr [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+
+    vmulss     xmm4, xmm4, kExpBias
+    vbroadcastss ymm4, xmm4
+    vpxor      ymm5, ymm5, ymm5
+    sub        edx, eax
+
+        // 16 pixel loop.
+ convertloop:
+    vmovdqu     ymm2, [eax]  // 16 shorts
+    add         eax, 32
+    vpunpckhwd  ymm3, ymm2, ymm5  // convert 16 shorts to 16 ints
+    vpunpcklwd  ymm2, ymm2, ymm5
+    vcvtdq2ps   ymm3, ymm3  // convert 16 ints to floats
+    vcvtdq2ps   ymm2, ymm2
+    vmulps      ymm3, ymm3, ymm4  // scale to adjust exponent for 5 bit range.
+    vmulps      ymm2, ymm2, ymm4
+    vpsrld      ymm3, ymm3, 13  // float convert to 8 half floats truncate
+    vpsrld      ymm2, ymm2, 13
+    vpackssdw   ymm2, ymm2, ymm3
+    vmovdqu     [eax + edx - 32], ymm2
+    sub         ecx, 16
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_AVX2
+
+#ifdef HAS_HALFFLOATROW_F16C
+__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
+                                         uint16_t* dst,
+                                         float scale,
+                                         int width) {
+  __asm {
+    mov        eax, [esp + 4] /* src */
+    mov        edx, [esp + 8] /* dst */
+    vbroadcastss ymm4, [esp + 12] /* scale */
+    mov        ecx, [esp + 16] /* width */
+    sub        edx, eax
+
+        // 16 pixel loop.
+ convertloop:
+    vpmovzxwd   ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
+    vpmovzxwd   ymm3, xmmword ptr [eax + 16]  // 8 more shorts
+    add         eax, 32
+    vcvtdq2ps   ymm2, ymm2  // convert 8 ints to floats
+    vcvtdq2ps   ymm3, ymm3
+    vmulps      ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
+    vmulps      ymm3, ymm3, ymm4
+    vcvtps2ph   xmm2, ymm2, 3  // float convert to 8 half floats truncate
+    vcvtps2ph   xmm3, ymm3, 3
+    vmovdqu     [eax + edx + 32], xmm2
+    vmovdqu     [eax + edx + 32 + 16], xmm3
+    sub         ecx, 16
+    jg          convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_HALFFLOATROW_F16C
+
 #ifdef HAS_ARGBCOLORTABLEROW_X86
 // Transform ARGB pixels with color table.
-__declspec(naked)
-void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
-                           int width) {
+__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
+                                             const uint8_t* table_argb,
+                                             int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
+    mov        eax, [esp + 4 + 4] /* dst_argb */
+    mov        esi, [esp + 4 + 8] /* table_argb */
+    mov        ecx, [esp + 4 + 12] /* width */
 
     // 1 pixel loop.
   convertloop:
@@ -6131,13 +6094,14 @@
 
 #ifdef HAS_RGBCOLORTABLEROW_X86
 // Transform RGB pixels with color table.
-__declspec(naked)
-void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
+__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
+                                            const uint8_t* table_argb,
+                                            int width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]   /* dst_argb */
-    mov        esi, [esp + 4 + 8]   /* table_argb */
-    mov        ecx, [esp + 4 + 12]  /* width */
+    mov        eax, [esp + 4 + 4] /* dst_argb */
+    mov        esi, [esp + 4 + 8] /* table_argb */
+    mov        ecx, [esp + 4 + 12] /* width */
 
     // 1 pixel loop.
   convertloop:
@@ -6162,27 +6126,28 @@
 
 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
 // Transform RGB pixels with luma table.
-__declspec(naked)
-void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                 int width,
-                                 const uint8* luma, uint32 lumacoeff) {
+__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
+                                                   uint8_t* dst_argb,
+                                                   int width,
+                                                   const uint8_t* luma,
+                                                   uint32_t lumacoeff) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]   /* src_argb */
-    mov        edi, [esp + 8 + 8]   /* dst_argb */
-    mov        ecx, [esp + 8 + 12]  /* width */
+    mov        eax, [esp + 8 + 4] /* src_argb */
+    mov        edi, [esp + 8 + 8] /* dst_argb */
+    mov        ecx, [esp + 8 + 12] /* width */
     movd       xmm2, dword ptr [esp + 8 + 16]  // luma table
     movd       xmm3, dword ptr [esp + 8 + 20]  // lumacoeff
     pshufd     xmm2, xmm2, 0
     pshufd     xmm3, xmm3, 0
-    pcmpeqb    xmm4, xmm4        // generate mask 0xff00ff00
+    pcmpeqb    xmm4, xmm4  // generate mask 0xff00ff00
     psllw      xmm4, 8
     pxor       xmm5, xmm5
 
-    // 4 pixel loop.
+        // 4 pixel loop.
   convertloop:
-    movdqu     xmm0, xmmword ptr [eax]      // generate luma ptr
+    movdqu     xmm0, xmmword ptr [eax]  // generate luma ptr
     pmaddubsw  xmm0, xmm3
     phaddw     xmm0, xmm0
     pand       xmm0, xmm4  // mask out low bits
--- a/third_party/libyuv/source/scale.cc
+++ b/third_party/libyuv/source/scale.cc
@@ -33,17 +33,25 @@
 // This is an optimized version for scaling down a plane to 1/2 of
 // its original size.
 
-static void ScalePlaneDown2(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown2(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) =
-      filtering == kFilterNone ? ScaleRowDown2_C :
-      (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C);
+  void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                        uint8_t* dst_ptr, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleRowDown2_C
+          : (filtering == kFilterLinear ? ScaleRowDown2Linear_C
+                                        : ScaleRowDown2Box_C);
   int row_stride = src_stride << 1;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
     src_stride = 0;
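
As restructured above, the kernel choice is a two-level conditional: kFilterNone selects plain decimation, kFilterLinear a horizontal pair average, and anything else the 2x2 box. A scalar sketch of the box kernel the SIMD variants below accelerate, consistent with ScaleRowDown2Box_C's rounded 2x2 average:

    static void ScaleRowDown2Box_Sketch(const uint8_t* s, ptrdiff_t src_stride,
                                        uint8_t* dst, int dst_width) {
      const uint8_t* t = s + src_stride;  // second row of each 2x2 box
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // +2 rounds
        s += 2;
        t += 2;
      }
    }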
@@ -51,46 +59,63 @@
 
 #if defined(HAS_SCALEROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON :
-        ScaleRowDown2Box_Any_NEON);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_NEON
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON
+                                          : ScaleRowDown2Box_Any_NEON);
     if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON :
-          ScaleRowDown2Box_NEON);
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_NEON
+                                                      : ScaleRowDown2Box_NEON);
     }
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 :
-        ScaleRowDown2Box_Any_SSSE3);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_SSSE3
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3
+                                          : ScaleRowDown2Box_Any_SSSE3);
     if (IS_ALIGNED(dst_width, 16)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 :
-          ScaleRowDown2Box_SSSE3);
+      ScaleRowDown2 =
+          filtering == kFilterNone
+              ? ScaleRowDown2_SSSE3
+              : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3
+                                            : ScaleRowDown2Box_SSSE3);
     }
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 :
-        ScaleRowDown2Box_Any_AVX2);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_AVX2
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2
+                                          : ScaleRowDown2Box_Any_AVX2);
     if (IS_ALIGNED(dst_width, 32)) {
-      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 :
-          (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 :
-          ScaleRowDown2Box_AVX2);
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_AVX2
+                                                      : ScaleRowDown2Box_AVX2);
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
-      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown2 = filtering ?
-        ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2;
+#if defined(HAS_SCALEROWDOWN2_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_Any_MSA
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA
+                                          : ScaleRowDown2Box_Any_MSA);
+    if (IS_ALIGNED(dst_width, 32)) {
+      ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA
+                                               : (filtering == kFilterLinear
+                                                      ? ScaleRowDown2Linear_MSA
+                                                      : ScaleRowDown2Box_MSA);
+    }
   }
 #endif
 
@@ -105,18 +130,25 @@
   }
 }
 
-static void ScalePlaneDown2_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown2_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
                                enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst_ptr, int dst_width) =
-    filtering == kFilterNone ? ScaleRowDown2_16_C :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C :
-        ScaleRowDown2Box_16_C);
+  void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                        uint16_t* dst_ptr, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleRowDown2_16_C
+          : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C
+                                        : ScaleRowDown2Box_16_C);
   int row_stride = src_stride << 1;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
     src_stride = 0;
@@ -124,25 +156,19 @@
 
 #if defined(HAS_SCALEROWDOWN2_16_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON :
-        ScaleRowDown2_16_NEON;
+    ScaleRowDown2 =
+        filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON;
   }
 #endif
 #if defined(HAS_SCALEROWDOWN2_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
-    ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 :
-        (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 :
-        ScaleRowDown2Box_16_SSE2);
+    ScaleRowDown2 =
+        filtering == kFilterNone
+            ? ScaleRowDown2_16_SSE2
+            : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2
+                                          : ScaleRowDown2Box_16_SSE2);
   }
 #endif
-#if defined(HAS_SCALEROWDOWN2_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) &&
-      IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown2 = filtering ?
-        ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2;
-  }
-#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -159,16 +185,22 @@
 // This is an optimized version for scaling down a plane to 1/4 of
 // its original size.
 
-static void ScalePlaneDown4(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown4(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) =
+  void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                        uint8_t* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C;
   int row_stride = src_stride << 2;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride * 2;  // Point to row 2.
     src_stride = 0;
@@ -175,8 +207,8 @@
   }
 #if defined(HAS_SCALEROWDOWN4_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON;
     if (IS_ALIGNED(dst_width, 8)) {
       ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON;
     }
@@ -184,8 +216,8 @@
 #endif
 #if defined(HAS_SCALEROWDOWN4_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3;
     if (IS_ALIGNED(dst_width, 8)) {
       ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3;
     }
@@ -193,19 +225,20 @@
 #endif
 #if defined(HAS_SCALEROWDOWN4_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2;
     if (IS_ALIGNED(dst_width, 16)) {
       ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2;
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN4_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2;
+#if defined(HAS_SCALEROWDOWN4_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA;
+    }
   }
 #endif
 
@@ -219,16 +252,22 @@
   }
 }
 
-static void ScalePlaneDown4_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown4_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
                                enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst_ptr, int dst_width) =
+  void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                        uint16_t* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C;
   int row_stride = src_stride << 2;
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     src_ptr += src_stride * 2;  // Point to row 2.
     src_stride = 0;
@@ -235,24 +274,16 @@
   }
 #if defined(HAS_SCALEROWDOWN4_16_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON :
-        ScaleRowDown4_16_NEON;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON;
   }
 #endif
 #if defined(HAS_SCALEROWDOWN4_16_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) {
-    ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 :
-        ScaleRowDown4_16_SSE2;
+    ScaleRowDown4 =
+        filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2;
   }
 #endif
-#if defined(HAS_SCALEROWDOWN4_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    ScaleRowDown4 = filtering ?
-        ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2;
-  }
-#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -265,18 +296,23 @@
 }
 
 // Scale plane down, 3/4
-
-static void ScalePlaneDown34(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown34(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-  void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown34_0 = ScaleRowDown34_C;
@@ -305,6 +341,26 @@
     }
   }
 #endif
+#if defined(HAS_SCALEROWDOWN34_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    if (!filtering) {
+      ScaleRowDown34_0 = ScaleRowDown34_Any_MSA;
+      ScaleRowDown34_1 = ScaleRowDown34_Any_MSA;
+    } else {
+      ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA;
+      ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA;
+    }
+    if (dst_width % 48 == 0) {
+      if (!filtering) {
+        ScaleRowDown34_0 = ScaleRowDown34_MSA;
+        ScaleRowDown34_1 = ScaleRowDown34_MSA;
+      } else {
+        ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA;
+        ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA;
+      }
+    }
+  }
+#endif
 #if defined(HAS_SCALEROWDOWN34_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     if (!filtering) {
@@ -325,19 +381,6 @@
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN34_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_DSPR2;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2;
-    }
-  }
-#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -346,8 +389,7 @@
     ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
-                     dst_ptr, dst_width);
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
   }
@@ -363,17 +405,23 @@
   }
 }
 
-static void ScalePlaneDown34_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown34_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
                                 enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
-  void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown34_0 = ScaleRowDown34_16_C;
@@ -404,19 +452,6 @@
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN34_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2;
-    } else {
-      ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2;
-      ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2;
-    }
-  }
-#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -425,8 +460,7 @@
     ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride;
     dst_ptr += dst_stride;
-    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride,
-                     dst_ptr, dst_width);
+    ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width);
     src_ptr += src_stride * 2;
     dst_ptr += dst_stride;
   }
@@ -442,7 +476,6 @@
   }
 }
 
-
 // Scale plane, 3/8
 // This is an optimized version for scaling down a plane to 3/8
 // of its original size.
@@ -458,18 +491,24 @@
 // ggghhhii
 // Boxes are 3x3, 2x3, 3x2 and 2x2
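
That is, each run of 8 source columns maps to 3 destination columns with box widths 3, 3 and 2 (and likewise for rows, which is where the 2x3, 3x2 and 2x2 edge boxes come from). A one-row sketch of the column grouping; the real kernels use fixed-point multiplies such as 65536/9 rather than division:

    dst[0] = (s[0] + s[1] + s[2]) / 3;
    dst[1] = (s[3] + s[4] + s[5]) / 3;
    dst[2] = (s[6] + s[7]) / 2;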
 
-static void ScalePlaneDown38(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr,
+static void ScalePlaneDown38(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr,
                              enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
-  void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride,
+                           uint8_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
   assert(dst_width % 3 == 0);
+  (void)src_width;
+  (void)src_height;
   if (!filtering) {
     ScaleRowDown38_3 = ScaleRowDown38_C;
     ScaleRowDown38_2 = ScaleRowDown38_C;
@@ -517,17 +556,24 @@
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN38_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
+#if defined(HAS_SCALEROWDOWN38_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
     if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_DSPR2;
+      ScaleRowDown38_3 = ScaleRowDown38_Any_MSA;
+      ScaleRowDown38_2 = ScaleRowDown38_Any_MSA;
     } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2;
+      ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA;
+      ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA;
     }
+    if (dst_width % 12 == 0) {
+      if (!filtering) {
+        ScaleRowDown38_3 = ScaleRowDown38_MSA;
+        ScaleRowDown38_2 = ScaleRowDown38_MSA;
+      } else {
+        ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA;
+        ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA;
+      }
+    }
   }
 #endif
 
@@ -554,17 +600,23 @@
   }
 }
 
-static void ScalePlaneDown38_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr,
+static void ScalePlaneDown38_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr,
                                 enum FilterMode filtering) {
   int y;
-  void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
-  void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
+  void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride,
+                           uint16_t* dst_ptr, int dst_width);
   const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride;
+  (void)src_width;
+  (void)src_height;
   assert(dst_width % 3 == 0);
   if (!filtering) {
     ScaleRowDown38_3 = ScaleRowDown38_16_C;
@@ -595,19 +647,6 @@
     }
   }
 #endif
-#if defined(HAS_SCALEROWDOWN38_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) &&
-      IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
-    if (!filtering) {
-      ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2;
-    } else {
-      ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2;
-      ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2;
-    }
-  }
-#endif
 
   for (y = 0; y < dst_height - 2; y += 3) {
     ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width);
@@ -634,8 +673,8 @@
 
 #define MIN1(x) ((x) < 1 ? 1 : (x))
 
-static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
-  uint32 sum = 0u;
+static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) {
+  uint32_t sum = 0u;
   int x;
   assert(iboxwidth > 0);
   for (x = 0; x < iboxwidth; ++x) {
@@ -644,8 +683,8 @@
   return sum;
 }
 
-static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) {
-  uint32 sum = 0u;
+static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) {
+  uint32_t sum = 0u;
   int x;
   assert(iboxwidth > 0);
   for (x = 0; x < iboxwidth; ++x) {
@@ -654,8 +693,12 @@
   return sum;
 }
 
-static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols2_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int dx,
+                            const uint16_t* src_ptr,
+                            uint8_t* dst_ptr) {
   int i;
   int scaletbl[2];
   int minboxwidth = dx >> 16;
@@ -666,13 +709,18 @@
     int ix = x >> 16;
     x += dx;
     boxwidth = MIN1((x >> 16) - ix);
-    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) *
-        scaletbl[boxwidth - minboxwidth] >> 16;
+    *dst_ptr++ =
+        SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >>
+        16;
   }
 }
 
-static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx,
-                               const uint32* src_ptr, uint16* dst_ptr) {
+static void ScaleAddCols2_16_C(int dst_width,
+                               int boxheight,
+                               int x,
+                               int dx,
+                               const uint32_t* src_ptr,
+                               uint16_t* dst_ptr) {
   int i;
   int scaletbl[2];
   int minboxwidth = dx >> 16;
@@ -684,14 +732,20 @@
     x += dx;
     boxwidth = MIN1((x >> 16) - ix);
     *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) *
-        scaletbl[boxwidth - minboxwidth]  >> 16;
+                     scaletbl[boxwidth - minboxwidth] >>
+                 16;
   }
 }
 
-static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols0_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int dx,
+                            const uint16_t* src_ptr,
+                            uint8_t* dst_ptr) {
   int scaleval = 65536 / boxheight;
   int i;
+  (void)dx;
   src_ptr += (x >> 16);
   for (i = 0; i < dst_width; ++i) {
     *dst_ptr++ = src_ptr[i] * scaleval >> 16;
@@ -698,8 +752,12 @@
   }
 }
 
-static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
-                            const uint16* src_ptr, uint8* dst_ptr) {
+static void ScaleAddCols1_C(int dst_width,
+                            int boxheight,
+                            int x,
+                            int dx,
+                            const uint16_t* src_ptr,
+                            uint8_t* dst_ptr) {
   int boxwidth = MIN1(dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
   int i;
@@ -710,8 +768,12 @@
   }
 }
 
-static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx,
-                               const uint32* src_ptr, uint16* dst_ptr) {
+static void ScaleAddCols1_16_C(int dst_width,
+                               int boxheight,
+                               int x,
+                               int dx,
+                               const uint32_t* src_ptr,
+                               uint16_t* dst_ptr) {
   int boxwidth = MIN1(dx >> 16);
   int scaleval = 65536 / (boxwidth * boxheight);
   int i;
@@ -728,10 +790,14 @@
 // one pixel of destination using fixed point (16.16) to step
 // through source, sampling a box of pixels with simple
 // averaging.
-static void ScalePlaneBox(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          int src_stride, int dst_stride,
-                          const uint8* src_ptr, uint8* dst_ptr) {
+static void ScalePlaneBox(int src_width,
+                          int src_height,
+                          int dst_width,
+                          int dst_height,
+                          int src_stride,
+                          int dst_stride,
+                          const uint8_t* src_ptr,
+                          uint8_t* dst_ptr) {
   int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
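
A note on the 16.16 arithmetic used from here on: ScaleSlope fills dx/dy via FixedDiv, so the high 16 bits of x index the source and the low 16 bits are the fraction; the box path then averages with a precomputed reciprocal. A small sketch under those assumptions:

    // FixedDiv semantics (assumed): 16.16 source step per destination pixel.
    static int FixedDivSketch(int num, int div) {
      return (int)(((int64_t)num << 16) / div);
    }
    // e.g. FixedDivSketch(1280, 320) == 0x40000, i.e. 4.0 source pixels per
    // output, so (x >> 16) advances 4 columns per destination pixel.
    //
    // Box average via reciprocal, as in ScaleAddCols1_C: for a 4x4 box,
    // scaleval = 65536 / (4 * 4) = 4096, and sum * 4096 >> 16 == sum / 16.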
@@ -739,18 +805,18 @@
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height << 16);
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   {
-    // Allocate a row buffer of uint16.
+    // Allocate a row buffer of uint16_t.
     align_buffer_64(row16, src_width * 2);
     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint16* src_ptr, uint8* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_C:
-        ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
-    void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) =
-        ScaleAddRow_C;
+                         const uint16_t* src_ptr, uint8_t* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_C
+                      : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C);
+    void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr,
+                        int src_width) = ScaleAddRow_C;
 #if defined(HAS_SCALEADDROW_SSE2)
     if (TestCpuFlag(kCpuHasSSE2)) {
       ScaleAddRow = ScaleAddRow_Any_SSE2;
@@ -775,11 +841,19 @@
       }
     }
 #endif
+#if defined(HAS_SCALEADDROW_MSA)
+    if (TestCpuFlag(kCpuHasMSA)) {
+      ScaleAddRow = ScaleAddRow_Any_MSA;
+      if (IS_ALIGNED(src_width, 16)) {
+        ScaleAddRow = ScaleAddRow_MSA;
+      }
+    }
+#endif
 
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
       int iy = y >> 16;
-      const uint8* src = src_ptr + iy * src_stride;
+      const uint8_t* src = src_ptr + iy * src_stride;
       y += dy;
       if (y > max_y) {
         y = max_y;
@@ -787,10 +861,10 @@
       boxheight = MIN1((y >> 16) - iy);
       memset(row16, 0, src_width * 2);
       for (k = 0; k < boxheight; ++k) {
-        ScaleAddRow(src, (uint16 *)(row16), src_width);
+        ScaleAddRow(src, (uint16_t*)(row16), src_width);
         src += src_stride;
       }
-      ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr);
       dst_ptr += dst_stride;
     }
     free_aligned_buffer_64(row16);
@@ -797,10 +871,14 @@
   }
 }
 
-static void ScalePlaneBox_16(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint16* src_ptr, uint16* dst_ptr) {
+static void ScalePlaneBox_16(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint16_t* src_ptr,
+                             uint16_t* dst_ptr) {
   int j, k;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -808,17 +886,17 @@
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height << 16);
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   {
-    // Allocate a row buffer of uint32.
+    // Allocate a row buffer of uint32_t.
     align_buffer_64(row32, src_width * 4);
     void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
-        const uint32* src_ptr, uint16* dst_ptr) =
-        (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C;
-    void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) =
-        ScaleAddRow_16_C;
+                         const uint32_t* src_ptr, uint16_t* dst_ptr) =
+        (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C;
+    void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr,
+                        int src_width) = ScaleAddRow_16_C;
 
 #if defined(HAS_SCALEADDROW_16_SSE2)
     if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) {
@@ -829,7 +907,7 @@
     for (j = 0; j < dst_height; ++j) {
       int boxheight;
       int iy = y >> 16;
-      const uint16* src = src_ptr + iy * src_stride;
+      const uint16_t* src = src_ptr + iy * src_stride;
       y += dy;
       if (y > max_y) {
         y = max_y;
@@ -837,10 +915,10 @@
       boxheight = MIN1((y >> 16) - iy);
       memset(row32, 0, src_width * 4);
       for (k = 0; k < boxheight; ++k) {
-        ScaleAddRow(src, (uint32 *)(row32), src_width);
+        ScaleAddRow(src, (uint32_t*)(row32), src_width);
         src += src_stride;
       }
-      ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr);
+      ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr);
       dst_ptr += dst_stride;
     }
     free_aligned_buffer_64(row32);
@@ -848,10 +926,14 @@
 }
 
 // Scale plane down with bilinear interpolation.
-void ScalePlaneBilinearDown(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_ptr, uint8* dst_ptr,
+void ScalePlaneBilinearDown(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
                             enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
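
Bilinear-down works row-first: InterpolateRow blends the two source rows bracketing y with an 8-bit fraction, then ScaleFilterCols resamples that row horizontally. A sketch of the vertical blend (yf = (y >> 8) & 255 as computed in the loop below; the SIMD kernels may round slightly differently):

    // Blend rows src and src + src_stride with fraction yf in [0, 255].
    for (int i = 0; i < src_width; ++i) {
      row[i] = (uint8_t)((src[i] * (256 - yf) + src[i + src_stride] * yf) >> 8);
    }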
@@ -864,14 +946,14 @@
 
   const int max_y = (src_height - 1) << 16;
   int j;
-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                          int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -898,16 +980,15 @@
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(src_width, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(src_width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
     }
   }
 #endif
 
-
 #if defined(HAS_SCALEFILTERCOLS_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
     ScaleFilterCols = ScaleFilterCols_SSSE3;
@@ -921,6 +1002,14 @@
     }
   }
 #endif
+#if defined(HAS_SCALEFILTERCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleFilterCols = ScaleFilterCols_MSA;
+    }
+  }
+#endif
   if (y > max_y) {
     y = max_y;
   }
@@ -927,7 +1016,7 @@
 
   for (j = 0; j < dst_height; ++j) {
     int yi = y >> 16;
-    const uint8* src = src_ptr + yi * src_stride;
+    const uint8_t* src = src_ptr + yi * src_stride;
     if (filtering == kFilterLinear) {
       ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
     } else {
@@ -944,10 +1033,14 @@
   free_aligned_buffer_64(row);
 }
 
-void ScalePlaneBilinearDown_16(int src_width, int src_height,
-                               int dst_width, int dst_height,
-                               int src_stride, int dst_stride,
-                               const uint16* src_ptr, uint16* dst_ptr,
+void ScalePlaneBilinearDown_16(int src_width,
+                               int src_height,
+                               int dst_width,
+                               int dst_height,
+                               int src_stride,
+                               int dst_stride,
+                               const uint16_t* src_ptr,
+                               uint16_t* dst_ptr,
                                enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -960,14 +1053,14 @@
 
   const int max_y = (src_height - 1) << 16;
   int j;
-  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                          int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleFilterCols64_16_C : ScaleFilterCols_16_C;
-  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_16_SSE2)
@@ -1002,16 +1095,7 @@
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_16_DSPR2;
-    if (IS_ALIGNED(src_width, 4)) {
-      InterpolateRow = InterpolateRow_16_DSPR2;
-    }
-  }
-#endif
 
-
 #if defined(HAS_SCALEFILTERCOLS_16_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
     ScaleFilterCols = ScaleFilterCols_16_SSSE3;
@@ -1023,13 +1107,13 @@
 
   for (j = 0; j < dst_height; ++j) {
     int yi = y >> 16;
-    const uint16* src = src_ptr + yi * src_stride;
+    const uint16_t* src = src_ptr + yi * src_stride;
     if (filtering == kFilterLinear) {
       ScaleFilterCols(dst_ptr, src, dst_width, x, dx);
     } else {
       int yf = (y >> 8) & 255;
-      InterpolateRow((uint16*)row, src, src_stride, src_width, yf);
-      ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx);
+      InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf);
+      ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx);
     }
     dst_ptr += dst_stride;
     y += dy;
@@ -1041,10 +1125,14 @@
 }
 
 // Scale plane up with bilinear interpolation.
-void ScalePlaneBilinearUp(int src_width, int src_height,
-                          int dst_width, int dst_height,
-                          int src_stride, int dst_stride,
-                          const uint8* src_ptr, uint8* dst_ptr,
+void ScalePlaneBilinearUp(int src_width,
+                          int src_height,
+                          int dst_width,
+                          int dst_height,
+                          int src_stride,
+                          int dst_stride,
+                          const uint8_t* src_ptr,
+                          uint8_t* dst_ptr,
                           enum FilterMode filtering) {
   int j;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1053,14 +1141,14 @@
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr,
+                          int dst_width, int x, int dx) =
       filtering ? ScaleFilterCols_C : ScaleCols_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -1087,14 +1175,6 @@
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
-    }
-  }
-#endif
 
   if (filtering && src_width >= 32768) {
     ScaleFilterCols = ScaleFilterCols64_C;
@@ -1112,6 +1192,14 @@
     }
   }
 #endif
+#if defined(HAS_SCALEFILTERCOLS_MSA)
+  if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) {
+    ScaleFilterCols = ScaleFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 16)) {
+      ScaleFilterCols = ScaleFilterCols_MSA;
+    }
+  }
+#endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleFilterCols = ScaleColsUp2_C;
 #if defined(HAS_SCALECOLS_SSE2)
@@ -1126,13 +1214,13 @@
   }
   {
     int yi = y >> 16;
-    const uint8* src = src_ptr + yi * src_stride;
+    const uint8_t* src = src_ptr + yi * src_stride;
 
     // Allocate 2 row buffers.
     const int kRowSize = (dst_width + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 
-    uint8* rowptr = row;
+    uint8_t* rowptr = row;
     int rowstride = kRowSize;
     int lasty = yi;
 
@@ -1172,10 +1260,14 @@
   }
 }
 
-void ScalePlaneBilinearUp_16(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint16* src_ptr, uint16* dst_ptr,
+void ScalePlaneBilinearUp_16(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint16_t* src_ptr,
+                             uint16_t* dst_ptr,
                              enum FilterMode filtering) {
   int j;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
@@ -1184,14 +1276,14 @@
   int dx = 0;
   int dy = 0;
   const int max_y = (src_height - 1) << 16;
-  void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
-  void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
+  void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr,
+                          int dst_width, int x, int dx) =
       filtering ? ScaleFilterCols_16_C : ScaleCols_16_C;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
 #if defined(HAS_INTERPOLATEROW_16_SSE2)
@@ -1226,14 +1318,6 @@
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2)) {
-    InterpolateRow = InterpolateRow_Any_16_DSPR2;
-    if (IS_ALIGNED(dst_width, 4)) {
-      InterpolateRow = InterpolateRow_16_DSPR2;
-    }
-  }
-#endif
 
   if (filtering && src_width >= 32768) {
     ScaleFilterCols = ScaleFilterCols64_16_C;
@@ -1257,13 +1341,13 @@
   }
   {
     int yi = y >> 16;
-    const uint16* src = src_ptr + yi * src_stride;
+    const uint16_t* src = src_ptr + yi * src_stride;
 
     // Allocate 2 row buffers.
     const int kRowSize = (dst_width + 31) & ~31;
     align_buffer_64(row, kRowSize * 4);
 
-    uint16* rowptr = (uint16*)row;
+    uint16_t* rowptr = (uint16_t*)row;
     int rowstride = kRowSize;
     int lasty = yi;
 
@@ -1308,20 +1392,24 @@
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.
 
-static void ScalePlaneSimple(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             int src_stride, int dst_stride,
-                             const uint8* src_ptr, uint8* dst_ptr) {
+static void ScalePlaneSimple(int src_width,
+                             int src_height,
+                             int dst_width,
+                             int dst_height,
+                             int src_stride,
+                             int dst_stride,
+                             const uint8_t* src_ptr,
+                             uint8_t* dst_ptr) {
   int i;
-  void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr,
-      int dst_width, int x, int dx) = ScaleCols_C;
+  void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width,
+                    int x, int dx) = ScaleCols_C;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
   int dx = 0;
   int dy = 0;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
   if (src_width * 2 == dst_width && x < 0x8000) {
@@ -1340,20 +1428,24 @@
   }
 }
 
-static void ScalePlaneSimple_16(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint16* src_ptr, uint16* dst_ptr) {
+static void ScalePlaneSimple_16(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint16_t* src_ptr,
+                                uint16_t* dst_ptr) {
   int i;
-  void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr,
-      int dst_width, int x, int dx) = ScaleCols_16_C;
+  void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width,
+                    int x, int dx) = ScaleCols_16_C;
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
   int y = 0;
   int dx = 0;
   int dy = 0;
-  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
 
   if (src_width * 2 == dst_width && x < 0x8000) {
@@ -1366,8 +1458,7 @@
   }
 
   for (i = 0; i < dst_height; ++i) {
-    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride,
-              dst_width, x, dx);
+    ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx);
     dst_ptr += dst_stride;
     y += dy;
   }
@@ -1377,14 +1468,18 @@
 // This function dispatches to a specialized scaler based on scale factor.
 
 LIBYUV_API
-void ScalePlane(const uint8* src, int src_stride,
-                int src_width, int src_height,
-                uint8* dst, int dst_stride,
-                int dst_width, int dst_height,
+void ScalePlane(const uint8_t* src,
+                int src_stride,
+                int src_width,
+                int src_height,
+                uint8_t* dst,
+                int dst_stride,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering) {
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height, filtering);
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
 
   // Negative height means invert the image.
   if (src_height < 0) {
@@ -1403,46 +1498,42 @@
   if (dst_width == src_width && filtering != kFilterBox) {
     int dy = FixedDiv(src_height, dst_height);
     // Arbitrary scale vertically, but unscaled horizontally.
-    ScalePlaneVertical(src_height,
-                       dst_width, dst_height,
-                       src_stride, dst_stride, src, dst,
-                       0, 0, dy, 1, filtering);
+    ScalePlaneVertical(src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, 0, 0, dy, 1, filtering);
     return;
   }
   if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
-    if (4 * dst_width == 3 * src_width &&
-        4 * dst_height == 3 * src_height) {
+    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
       // optimized, 3/4
-      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, filtering);
       return;
     }
     if (2 * dst_width == src_width && 2 * dst_height == src_height) {
       // optimized, 1/2
-      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst, filtering);
       return;
     }
     // 3/8, exact ratio required in both dimensions.
-    if (8 * dst_width == 3 * src_width &&
-        dst_height == ((src_height * 3 + 7) / 8)) {
+    if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
       // optimized, 3/8
-      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
-                       src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride,
+                       dst_stride, src, dst, filtering);
       return;
     }
     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
         (filtering == kFilterBox || filtering == kFilterNone)) {
       // optimized, 1/4
-      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst, filtering);
+      ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst, filtering);
       return;
     }
   }
   if (filtering == kFilterBox && dst_height * 2 < src_height) {
-    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
-                  src_stride, dst_stride, src, dst);
+    ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride,
+                  dst_stride, src, dst);
     return;
   }
   if (filtering && dst_height > src_height) {
@@ -1455,19 +1546,23 @@
                            src_stride, dst_stride, src, dst, filtering);
     return;
   }
-  ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
-                   src_stride, dst_stride, src, dst);
+  ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride,
+                   dst_stride, src, dst);
 }
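
Given the dispatch order above, the specialized paths key on exact ratios before the generic box and bilinear fallbacks. For example, halving both dimensions always takes the optimized 1/2 branch (ScalePlaneDown2):

    // 640x360 -> 320x180: 2 * dst == src on both axes, so the 1/2 path runs
    // (strides here are hypothetical and equal to the widths).
    ScalePlane(src, 640, 640, 360, dst, 320, 320, 180, kFilterBox);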
 
 LIBYUV_API
-void ScalePlane_16(const uint16* src, int src_stride,
-                  int src_width, int src_height,
-                  uint16* dst, int dst_stride,
-                  int dst_width, int dst_height,
-                  enum FilterMode filtering) {
+void ScalePlane_16(const uint16_t* src,
+                   int src_stride,
+                   int src_width,
+                   int src_height,
+                   uint16_t* dst,
+                   int dst_stride,
+                   int dst_width,
+                   int dst_height,
+                   enum FilterMode filtering) {
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height, filtering);
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
+                                filtering);
 
   // Negative height means invert the image.
   if (src_height < 0) {
@@ -1483,19 +1578,16 @@
     CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height);
     return;
   }
-  if (dst_width == src_width) {
+  if (dst_width == src_width && filtering != kFilterBox) {
     int dy = FixedDiv(src_height, dst_height);
     // Arbitrary scale vertically, but unscaled horizontally.
-    ScalePlaneVertical_16(src_height,
-                          dst_width, dst_height,
-                          src_stride, dst_stride, src, dst,
-                          0, 0, dy, 1, filtering);
+    ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride,
+                          dst_stride, src, dst, 0, 0, dy, 1, filtering);
     return;
   }
   if (dst_width <= Abs(src_width) && dst_height <= src_height) {
     // Scale down.
-    if (4 * dst_width == 3 * src_width &&
-        4 * dst_height == 3 * src_height) {
+    if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) {
       // optimized, 3/4
       ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height,
                           src_stride, dst_stride, src, dst, filtering);
@@ -1508,8 +1600,7 @@
       return;
     }
     // 3/8, exact ratio required in both dimensions.
-    if (8 * dst_width == 3 * src_width &&
-        dst_height == ((src_height * 3 + 7) / 8)) {
+    if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) {
       // optimized, 3/8
       ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height,
                           src_stride, dst_stride, src, dst, filtering);
@@ -1516,7 +1607,7 @@
       return;
     }
     if (4 * dst_width == src_width && 4 * dst_height == src_height &&
-               filtering != kFilterBilinear) {
+        (filtering == kFilterBox || filtering == kFilterNone)) {
       // optimized, 1/4
       ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height,
                          src_stride, dst_stride, src, dst, filtering);
@@ -1524,8 +1615,8 @@
     }
   }
   if (filtering == kFilterBox && dst_height * 2 < src_height) {
-    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height,
-                     src_stride, dst_stride, src, dst);
+    ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride,
+                     dst_stride, src, dst);
     return;
   }
   if (filtering && dst_height > src_height) {
@@ -1538,8 +1629,8 @@
                               src_stride, dst_stride, src, dst, filtering);
     return;
   }
-  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height,
-                      src_stride, dst_stride, src, dst);
+  ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride,
+                      dst_stride, src, dst);
 }
 
 // Scale an I420 image.
@@ -1546,14 +1637,22 @@
 // This function in turn calls a scaling function for each plane.
 
 LIBYUV_API
-int I420Scale(const uint8* src_y, int src_stride_y,
-              const uint8* src_u, int src_stride_u,
-              const uint8* src_v, int src_stride_v,
-              int src_width, int src_height,
-              uint8* dst_y, int dst_stride_y,
-              uint8* dst_u, int dst_stride_u,
-              uint8* dst_v, int dst_stride_v,
-              int dst_width, int dst_height,
+int I420Scale(const uint8_t* src_y,
+              int src_stride_y,
+              const uint8_t* src_u,
+              int src_stride_u,
+              const uint8_t* src_v,
+              int src_stride_v,
+              int src_width,
+              int src_height,
+              uint8_t* dst_y,
+              int dst_stride_y,
+              uint8_t* dst_u,
+              int dst_stride_u,
+              uint8_t* dst_v,
+              int dst_stride_v,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering) {
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
@@ -1560,32 +1659,37 @@
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
-  ScalePlane(src_y, src_stride_y, src_width, src_height,
-             dst_y, dst_stride_y, dst_width, dst_height,
-             filtering);
-  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
-             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-             filtering);
-  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
-             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-             filtering);
+  ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+             dst_width, dst_height, filtering);
+  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+             dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+             dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
   return 0;
 }
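
I420Scale sizes the chroma planes with SUBSAMPLE(v, 1, 1), which for non-negative v works out to (v + 1) >> 1, half the luma dimension rounded up. A usage sketch for halving an I420 frame; the packed buffer layout, the include path, and the kFilterBilinear choice are illustrative assumptions, not anything this patch prescribes:

    #include <cstdint>

    #include "libyuv/scale.h"  // assumed include path for I420Scale

    void HalveI420(const uint8_t* src, int sw, int sh, uint8_t* dst) {
      int dw = (sw + 1) / 2, dh = (sh + 1) / 2;
      // Chroma sizes match SUBSAMPLE(v, 1, 1): (v + 1) >> 1 for v >= 0.
      int shw = (sw + 1) >> 1, shh = (sh + 1) >> 1;
      int dhw = (dw + 1) >> 1, dhh = (dh + 1) >> 1;
      const uint8_t* src_u = src + sw * sh;    // assumed I420: Y, U, V
      const uint8_t* src_v = src_u + shw * shh;
      uint8_t* dst_u = dst + dw * dh;
      uint8_t* dst_v = dst_u + dhw * dhh;
      // Returns 0 on success, -1 if the argument checks above fail.
      libyuv::I420Scale(src, sw, src_u, shw, src_v, shw, sw, sh,
                        dst, dw, dst_u, dhw, dst_v, dhw, dw, dh,
                        libyuv::kFilterBilinear);
    }
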
 
 LIBYUV_API
-int I420Scale_16(const uint16* src_y, int src_stride_y,
-                 const uint16* src_u, int src_stride_u,
-                 const uint16* src_v, int src_stride_v,
-                 int src_width, int src_height,
-                 uint16* dst_y, int dst_stride_y,
-                 uint16* dst_u, int dst_stride_u,
-                 uint16* dst_v, int dst_stride_v,
-                 int dst_width, int dst_height,
+int I420Scale_16(const uint16_t* src_y,
+                 int src_stride_y,
+                 const uint16_t* src_u,
+                 int src_stride_u,
+                 const uint16_t* src_v,
+                 int src_stride_v,
+                 int src_width,
+                 int src_height,
+                 uint16_t* dst_y,
+                 int dst_stride_y,
+                 uint16_t* dst_u,
+                 int dst_stride_u,
+                 uint16_t* dst_v,
+                 int dst_stride_v,
+                 int dst_width,
+                 int dst_height,
                  enum FilterMode filtering) {
   int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
   int src_halfheight = SUBSAMPLE(src_height, 1, 1);
@@ -1592,78 +1696,43 @@
   int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
   int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
   if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+      src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+      dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
 
-  ScalePlane_16(src_y, src_stride_y, src_width, src_height,
-                dst_y, dst_stride_y, dst_width, dst_height,
-                filtering);
-  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight,
-                dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
-                filtering);
-  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight,
-                dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
-                filtering);
+  ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y,
+                dst_width, dst_height, filtering);
+  ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u,
+                dst_stride_u, dst_halfwidth, dst_halfheight, filtering);
+  ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v,
+                dst_stride_v, dst_halfwidth, dst_halfheight, filtering);
   return 0;
 }
 
 // Deprecated api
 LIBYUV_API
-int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
-          int src_stride_y, int src_stride_u, int src_stride_v,
-          int src_width, int src_height,
-          uint8* dst_y, uint8* dst_u, uint8* dst_v,
-          int dst_stride_y, int dst_stride_u, int dst_stride_v,
-          int dst_width, int dst_height,
+int Scale(const uint8_t* src_y,
+          const uint8_t* src_u,
+          const uint8_t* src_v,
+          int src_stride_y,
+          int src_stride_u,
+          int src_stride_v,
+          int src_width,
+          int src_height,
+          uint8_t* dst_y,
+          uint8_t* dst_u,
+          uint8_t* dst_v,
+          int dst_stride_y,
+          int dst_stride_u,
+          int dst_stride_v,
+          int dst_width,
+          int dst_height,
           LIBYUV_BOOL interpolate) {
-  return I420Scale(src_y, src_stride_y,
-                   src_u, src_stride_u,
-                   src_v, src_stride_v,
-                   src_width, src_height,
-                   dst_y, dst_stride_y,
-                   dst_u, dst_stride_u,
-                   dst_v, dst_stride_v,
-                   dst_width, dst_height,
-                   interpolate ? kFilterBox : kFilterNone);
-}
-
-// Deprecated api
-LIBYUV_API
-int ScaleOffset(const uint8* src, int src_width, int src_height,
-                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
-                LIBYUV_BOOL interpolate) {
-  // Chroma requires offset to multiple of 2.
-  int dst_yoffset_even = dst_yoffset & ~1;
-  int src_halfwidth = SUBSAMPLE(src_width, 1, 1);
-  int src_halfheight = SUBSAMPLE(src_height, 1, 1);
-  int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
-  int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
-  int aheight = dst_height - dst_yoffset_even * 2;  // actual output height
-  const uint8* src_y = src;
-  const uint8* src_u = src + src_width * src_height;
-  const uint8* src_v = src + src_width * src_height +
-                             src_halfwidth * src_halfheight;
-  uint8* dst_y = dst + dst_yoffset_even * dst_width;
-  uint8* dst_u = dst + dst_width * dst_height +
-                 (dst_yoffset_even >> 1) * dst_halfwidth;
-  uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
-                 (dst_yoffset_even >> 1) * dst_halfwidth;
-  if (!src || src_width <= 0 || src_height <= 0 ||
-      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 ||
-      dst_yoffset_even >= dst_height) {
-    return -1;
-  }
-  return I420Scale(src_y, src_width,
-                   src_u, src_halfwidth,
-                   src_v, src_halfwidth,
-                   src_width, src_height,
-                   dst_y, dst_width,
-                   dst_u, dst_halfwidth,
-                   dst_v, dst_halfwidth,
-                   dst_width, aheight,
-                   interpolate ? kFilterBox : kFilterNone);
+  return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v,
+                   src_stride_v, src_width, src_height, dst_y, dst_stride_y,
+                   dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width,
+                   dst_height, interpolate ? kFilterBox : kFilterNone);
 }
 
 #ifdef __cplusplus
--- a/third_party/libyuv/source/scale_any.cc
+++ b/third_party/libyuv/source/scale_any.cc
@@ -20,184 +20,429 @@
 
 // Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols
 #define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK)                            \
-    void NAMEANY(uint8* dst_ptr, const uint8* src_ptr,                         \
-                 int dst_width, int x, int dx) {                               \
-      int n = dst_width & ~MASK;                                               \
-      if (n > 0) {                                                             \
-        TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                 \
-      }                                                                        \
-      TERP_C(dst_ptr + n * BPP, src_ptr,                                       \
-             dst_width & MASK, x + n * dx, dx);                                \
-    }
+  void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \
+               int dx) {                                                       \
+    int r = dst_width & MASK;                                                  \
+    int n = dst_width & ~MASK;                                                 \
+    if (n > 0) {                                                               \
+      TERP_SIMD(dst_ptr, src_ptr, n, x, dx);                                   \
+    }                                                                          \
+    TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx);                     \
+  }
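
CANY is this file's "any width" pattern: the SIMD kernel covers the largest multiple of MASK + 1 columns, the C kernel finishes the r = dst_width & MASK remainder, and advancing the phase by n * dx keeps both halves sampling identical positions. Worked numbers for a NEON kernel with MASK = 7, for illustration only:

    #include <cassert>

    int main() {
      const int kMask = 7;         // kernel emits 8 columns per step
      int dst_width = 100;
      int n = dst_width & ~kMask;  // 96 columns for the SIMD kernel
      int r = dst_width & kMask;   // 4 leftover columns for the C kernel
      assert(n == 96 && r == 4);
      // The C tail writes at dst_ptr + n * BPP with x advanced by n * dx,
      // so it resumes exactly where the SIMD kernel stopped.
      return 0;
    }
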
 
 #ifdef HAS_SCALEFILTERCOLS_NEON
 CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7)
 #endif
+#ifdef HAS_SCALEFILTERCOLS_MSA
+CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15)
+#endif
 #ifdef HAS_SCALEARGBCOLS_NEON
 CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7)
 #endif
+#ifdef HAS_SCALEARGBCOLS_MSA
+CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3)
+#endif
 #ifdef HAS_SCALEARGBFILTERCOLS_NEON
-CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON,
-     ScaleARGBFilterCols_C, 4, 3)
+CANY(ScaleARGBFilterCols_Any_NEON,
+     ScaleARGBFilterCols_NEON,
+     ScaleARGBFilterCols_C,
+     4,
+     3)
 #endif
+#ifdef HAS_SCALEARGBFILTERCOLS_MSA
+CANY(ScaleARGBFilterCols_Any_MSA,
+     ScaleARGBFilterCols_MSA,
+     ScaleARGBFilterCols_C,
+     4,
+     7)
+#endif
 #undef CANY
 
 // Fixed scale down.
+// Mask may be non-power of 2, so use MOD
 #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
-                     dst_ptr + n * BPP, r);                                    \
-    }
+  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+               int dst_width) {                                                \
+    int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */          \
+    int n = dst_width - r;                                                     \
+    if (n > 0) {                                                               \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                      \
+    }                                                                          \
+    SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                   \
+                   dst_ptr + n * BPP, r);                                      \
+  }
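
SDANY switches from & to % because MASK + 1 is not always a power of two: the 3/4 row functions use MASK = 23, so the SIMD kernel needs whole groups of 24 outputs. Note too that FACTOR is spelled 4 / 3, so (n * FACTOR) expands textually to (n * 4 / 3), which stays exact whenever n is a multiple of 3. Illustrative numbers:

    #include <cassert>

    int main() {
      int dst_width = 100;
      int r = (int)((unsigned)dst_width % 24);  // 4 leftover output pixels
      int n = dst_width - r;                    // 96, a whole number of 24s
      assert(r == 4 && n == 96);
      // (n * 4 / 3) evaluates left to right: 384 / 3 = 128, so the C tail
      // starts at source pixel 128 after the SIMD kernel's 96 outputs.
      assert(n * 4 / 3 == 128);
      return 0;
    }
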
 
 // Fixed scale down for odd source width.  Used by I420Blend subsampling.
 // Since dst_width is (width + 1) / 2, this function scales one less pixel
 // and copies the last pixel.
 #define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK)   \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride,                   \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1));               \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                    \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                 \
-                     dst_ptr + n * BPP, r);                                    \
-    }
+  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \
+               int dst_width) {                                                \
+    int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */    \
+    int n = (dst_width - 1) - r;                                               \
+    if (n > 0) {                                                               \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n);                      \
+    }                                                                          \
+    SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride,                   \
+                   dst_ptr + n * BPP, r + 1);                                  \
+  }
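
SDODD serves I420Blend's odd-width case, where dst_width is (src_width + 1) / 2 and the last output pixel has no right-hand source neighbor. The rewritten macro carves the SIMD span out of dst_width - 1 and hands the C kernel r + 1 pixels, so the edge pixel is always produced by the C code that can replicate it. For example:

    #include <cassert>

    int main() {
      const int kMask = 15;                   // 16-pixel SIMD groups
      int dst_width = 33;                     // from an odd source width 65
      int r = (dst_width - 1) % (kMask + 1);  // 0
      int n = (dst_width - 1) - r;            // 32 pixels for SIMD
      assert(n == 32 && r + 1 == 1);          // C emits just the edge pixel
      return 0;
    }
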
 
 #ifdef HAS_SCALEROWDOWN2_SSSE3
 SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3,
-      ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C,
-      2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3,
-      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_SSSE3,
+      ScaleRowDown2Linear_SSSE3,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      15)
+SDANY(ScaleRowDown2Box_Any_SSSE3,
+      ScaleRowDown2Box_SSSE3,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      15)
+SDODD(ScaleRowDown2Box_Odd_SSSE3,
+      ScaleRowDown2Box_SSSE3,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      15)
 #endif
 #ifdef HAS_SCALEROWDOWN2_AVX2
 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31)
-SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2,
-      ScaleRowDown2Linear_C, 2, 1, 31)
-SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C,
-      2, 1, 31)
-SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C,
-      2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_AVX2,
+      ScaleRowDown2Linear_AVX2,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      31)
+SDANY(ScaleRowDown2Box_Any_AVX2,
+      ScaleRowDown2Box_AVX2,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      31)
+SDODD(ScaleRowDown2Box_Odd_AVX2,
+      ScaleRowDown2Box_AVX2,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      31)
 #endif
 #ifdef HAS_SCALEROWDOWN2_NEON
 SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15)
-SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON,
-      ScaleRowDown2Linear_C, 2, 1, 15)
-SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON,
-      ScaleRowDown2Box_C, 2, 1, 15)
-SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON,
-      ScaleRowDown2Box_Odd_C, 2, 1, 15)
+SDANY(ScaleRowDown2Linear_Any_NEON,
+      ScaleRowDown2Linear_NEON,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      15)
+SDANY(ScaleRowDown2Box_Any_NEON,
+      ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      15)
+SDODD(ScaleRowDown2Box_Odd_NEON,
+      ScaleRowDown2Box_NEON,
+      ScaleRowDown2Box_Odd_C,
+      2,
+      1,
+      15)
 #endif
+#ifdef HAS_SCALEROWDOWN2_MSA
+SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31)
+SDANY(ScaleRowDown2Linear_Any_MSA,
+      ScaleRowDown2Linear_MSA,
+      ScaleRowDown2Linear_C,
+      2,
+      1,
+      31)
+SDANY(ScaleRowDown2Box_Any_MSA,
+      ScaleRowDown2Box_MSA,
+      ScaleRowDown2Box_C,
+      2,
+      1,
+      31)
+#endif
 #ifdef HAS_SCALEROWDOWN4_SSSE3
 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C,
-      4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_SSSE3,
+      ScaleRowDown4Box_SSSE3,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      7)
 #endif
 #ifdef HAS_SCALEROWDOWN4_AVX2
 SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15)
-SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C,
-      4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_AVX2,
+      ScaleRowDown4Box_AVX2,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      15)
 #endif
 #ifdef HAS_SCALEROWDOWN4_NEON
 SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7)
-SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C,
-      4, 1, 7)
+SDANY(ScaleRowDown4Box_Any_NEON,
+      ScaleRowDown4Box_NEON,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      7)
 #endif
+#ifdef HAS_SCALEROWDOWN4_MSA
+SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15)
+SDANY(ScaleRowDown4Box_Any_MSA,
+      ScaleRowDown4Box_MSA,
+      ScaleRowDown4Box_C,
+      4,
+      1,
+      15)
+#endif
 #ifdef HAS_SCALEROWDOWN34_SSSE3
-SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3,
-      ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3,
-      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3,
-      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_Any_SSSE3,
+      ScaleRowDown34_SSSE3,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_0_Box_Any_SSSE3,
+      ScaleRowDown34_0_Box_SSSE3,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_1_Box_Any_SSSE3,
+      ScaleRowDown34_1_Box_SSSE3,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      23)
 #endif
 #ifdef HAS_SCALEROWDOWN34_NEON
-SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON,
-      ScaleRowDown34_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON,
-      ScaleRowDown34_0_Box_C, 4 / 3, 1, 23)
-SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON,
-      ScaleRowDown34_1_Box_C, 4 / 3, 1, 23)
+SDANY(ScaleRowDown34_Any_NEON,
+      ScaleRowDown34_NEON,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_0_Box_Any_NEON,
+      ScaleRowDown34_0_Box_NEON,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      23)
+SDANY(ScaleRowDown34_1_Box_Any_NEON,
+      ScaleRowDown34_1_Box_NEON,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      23)
 #endif
+#ifdef HAS_SCALEROWDOWN34_MSA
+SDANY(ScaleRowDown34_Any_MSA,
+      ScaleRowDown34_MSA,
+      ScaleRowDown34_C,
+      4 / 3,
+      1,
+      47)
+SDANY(ScaleRowDown34_0_Box_Any_MSA,
+      ScaleRowDown34_0_Box_MSA,
+      ScaleRowDown34_0_Box_C,
+      4 / 3,
+      1,
+      47)
+SDANY(ScaleRowDown34_1_Box_Any_MSA,
+      ScaleRowDown34_1_Box_MSA,
+      ScaleRowDown34_1_Box_C,
+      4 / 3,
+      1,
+      47)
+#endif
 #ifdef HAS_SCALEROWDOWN38_SSSE3
-SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3,
-      ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3,
-      ScaleRowDown38_3_Box_C, 8 / 3, 1, 5)
-SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3,
-      ScaleRowDown38_2_Box_C, 8 / 3, 1, 5)
+SDANY(ScaleRowDown38_Any_SSSE3,
+      ScaleRowDown38_SSSE3,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_SSSE3,
+      ScaleRowDown38_3_Box_SSSE3,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      5)
+SDANY(ScaleRowDown38_2_Box_Any_SSSE3,
+      ScaleRowDown38_2_Box_SSSE3,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      5)
 #endif
 #ifdef HAS_SCALEROWDOWN38_NEON
-SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON,
-      ScaleRowDown38_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON,
-      ScaleRowDown38_3_Box_C, 8 / 3, 1, 11)
-SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON,
-      ScaleRowDown38_2_Box_C, 8 / 3, 1, 11)
+SDANY(ScaleRowDown38_Any_NEON,
+      ScaleRowDown38_NEON,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_NEON,
+      ScaleRowDown38_3_Box_NEON,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_2_Box_Any_NEON,
+      ScaleRowDown38_2_Box_NEON,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      11)
 #endif
+#ifdef HAS_SCALEROWDOWN38_MSA
+SDANY(ScaleRowDown38_Any_MSA,
+      ScaleRowDown38_MSA,
+      ScaleRowDown38_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_3_Box_Any_MSA,
+      ScaleRowDown38_3_Box_MSA,
+      ScaleRowDown38_3_Box_C,
+      8 / 3,
+      1,
+      11)
+SDANY(ScaleRowDown38_2_Box_Any_MSA,
+      ScaleRowDown38_2_Box_MSA,
+      ScaleRowDown38_2_Box_C,
+      8 / 3,
+      1,
+      11)
+#endif
 
 #ifdef HAS_SCALEARGBROWDOWN2_SSE2
-SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2,
-      ScaleARGBRowDown2_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2,
-      ScaleARGBRowDown2Linear_C, 2, 4, 3)
-SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2,
-      ScaleARGBRowDown2Box_C, 2, 4, 3)
+SDANY(ScaleARGBRowDown2_Any_SSE2,
+      ScaleARGBRowDown2_SSE2,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Linear_Any_SSE2,
+      ScaleARGBRowDown2Linear_SSE2,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Box_Any_SSE2,
+      ScaleARGBRowDown2Box_SSE2,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      3)
 #endif
 #ifdef HAS_SCALEARGBROWDOWN2_NEON
-SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON,
-      ScaleARGBRowDown2_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON,
-      ScaleARGBRowDown2Linear_C, 2, 4, 7)
-SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON,
-      ScaleARGBRowDown2Box_C, 2, 4, 7)
+SDANY(ScaleARGBRowDown2_Any_NEON,
+      ScaleARGBRowDown2_NEON,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      7)
+SDANY(ScaleARGBRowDown2Linear_Any_NEON,
+      ScaleARGBRowDown2Linear_NEON,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      7)
+SDANY(ScaleARGBRowDown2Box_Any_NEON,
+      ScaleARGBRowDown2Box_NEON,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      7)
 #endif
+#ifdef HAS_SCALEARGBROWDOWN2_MSA
+SDANY(ScaleARGBRowDown2_Any_MSA,
+      ScaleARGBRowDown2_MSA,
+      ScaleARGBRowDown2_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Linear_Any_MSA,
+      ScaleARGBRowDown2Linear_MSA,
+      ScaleARGBRowDown2Linear_C,
+      2,
+      4,
+      3)
+SDANY(ScaleARGBRowDown2Box_Any_MSA,
+      ScaleARGBRowDown2Box_MSA,
+      ScaleARGBRowDown2Box_C,
+      2,
+      4,
+      3)
+#endif
 #undef SDANY
 
 // Scale down by even scale factor.
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)          \
-    void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx,    \
-                 uint8* dst_ptr, int dst_width) {                              \
-      int r = (int)((unsigned int)dst_width % (MASK + 1));                     \
-      int n = dst_width - r;                                                   \
-      if (n > 0) {                                                             \
-        SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);         \
-      }                                                                        \
-      SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride,              \
-                     src_stepx, dst_ptr + n * BPP, r);                         \
-    }
+#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK)       \
+  void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \
+               uint8_t* dst_ptr, int dst_width) {                           \
+    int r = dst_width & MASK;                                               \
+    int n = dst_width & ~MASK;                                              \
+    if (n > 0) {                                                            \
+      SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n);        \
+    }                                                                       \
+    SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx,  \
+                   dst_ptr + n * BPP, r);                                   \
+  }
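
SDAANY can return to bitmasking since every ScaleARGBRowDownEven variant uses a power-of-two group size, and the decimation step travels in src_stepx rather than FACTOR: the C tail's source offset is (n * src_stepx) pixels, times BPP bytes. E.g. for a 4x even step, illustration only:

    #include <cassert>

    int main() {
      const int kMask = 3;         // kernels emit 4 ARGB pixels per step
      int dst_width = 10, src_stepx = 4;
      int n = dst_width & ~kMask;  // 8 pixels via SIMD
      int r = dst_width & kMask;   // 2 pixels via C
      // C tail source offset: pixel 32, i.e. byte 128 for 4-byte ARGB.
      assert((n * src_stepx) * 4 == 128 && r == 2);
      return 0;
    }
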
 
 #ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2
-SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2,
-       ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2,
-       ScaleARGBRowDownEvenBox_C, 4, 3)
+SDAANY(ScaleARGBRowDownEven_Any_SSE2,
+       ScaleARGBRowDownEven_SSE2,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2,
+       ScaleARGBRowDownEvenBox_SSE2,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
 #endif
 #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON
-SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON,
-       ScaleARGBRowDownEven_C, 4, 3)
-SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON,
-       ScaleARGBRowDownEvenBox_C, 4, 3)
+SDAANY(ScaleARGBRowDownEven_Any_NEON,
+       ScaleARGBRowDownEven_NEON,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_NEON,
+       ScaleARGBRowDownEvenBox_NEON,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
 #endif
+#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA
+SDAANY(ScaleARGBRowDownEven_Any_MSA,
+       ScaleARGBRowDownEven_MSA,
+       ScaleARGBRowDownEven_C,
+       4,
+       3)
+SDAANY(ScaleARGBRowDownEvenBox_Any_MSA,
+       ScaleARGBRowDownEvenBox_MSA,
+       ScaleARGBRowDownEvenBox_C,
+       4,
+       3)
+#endif
 
 // Add rows box filter scale down.
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)                  \
-  void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) {         \
-      int n = src_width & ~MASK;                                               \
-      if (n > 0) {                                                             \
-        SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                                 \
-      }                                                                        \
-      SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);               \
-    }
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK)              \
+  void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \
+    int n = src_width & ~MASK;                                             \
+    if (n > 0) {                                                           \
+      SCALEADDROW_SIMD(src_ptr, dst_ptr, n);                               \
+    }                                                                      \
+    SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK);             \
+  }
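
SAANY wraps the box filter's row accumulator: ScaleAddRow adds a uint8_t source row into a uint16_t sum row, and the caller divides by the number of accumulated rows afterwards. A plain C++ reference of that behaviour, reconstructed from the call shape rather than taken from libyuv:

    #include <cassert>
    #include <cstdint>

    // Assumed reference behaviour of ScaleAddRow_C.
    static void ScaleAddRowRef(const uint8_t* src, uint16_t* dst, int w) {
      for (int x = 0; x < w; ++x) {
        dst[x] += src[x];  // widen to 16 bits so row sums don't clip at 255
      }
    }

    int main() {
      uint8_t row0[4] = {10, 20, 30, 40};
      uint8_t row1[4] = {30, 20, 10, 0};
      uint16_t sum[4] = {0, 0, 0, 0};
      ScaleAddRowRef(row0, sum, 4);
      ScaleAddRowRef(row1, sum, 4);
      assert(sum[0] == 40 && sum[3] == 40);  // per-column sums of both rows
      return 0;
    }
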
 
 #ifdef HAS_SCALEADDROW_SSE2
 SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15)
@@ -208,6 +453,9 @@
 #ifdef HAS_SCALEADDROW_NEON
 SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15)
 #endif
+#ifdef HAS_SCALEADDROW_MSA
+SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15)
+#endif
 #undef SAANY
 
 #ifdef __cplusplus
@@ -214,8 +462,3 @@
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
-
-
-
-
--- a/third_party/libyuv/source/scale_argb.cc
+++ b/third_party/libyuv/source/scale_argb.cc
@@ -30,20 +30,31 @@
 // ScaleARGB ARGB, 1/2
 // This is an optimized version for scaling down an ARGB to 1/2 of
 // its original size.
-static void ScaleARGBDown2(int src_width, int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint8* src_argb, uint8* dst_argb,
-                           int x, int dx, int y, int dy,
+static void ScaleARGBDown2(int src_width,
+                           int src_height,
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint8_t* src_argb,
+                           uint8_t* dst_argb,
+                           int x,
+                           int dx,
+                           int y,
+                           int dy,
                            enum FilterMode filtering) {
   int j;
   int row_stride = src_stride * (dy >> 16);
-  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) =
-    filtering == kFilterNone ? ScaleARGBRowDown2_C :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
-        ScaleARGBRowDown2Box_C);
-  assert(dx == 65536 * 2);  // Test scale factor of 2.
+  void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                            uint8_t* dst_argb, int dst_width) =
+      filtering == kFilterNone
+          ? ScaleARGBRowDown2_C
+          : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C
+                                        : ScaleARGBRowDown2Box_C);
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 2);      // Test scale factor of 2.
   assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
   // Advance to odd row, even column.
   if (filtering == kFilterBilinear) {
@@ -54,28 +65,52 @@
 
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 :
-        ScaleARGBRowDown2Box_Any_SSE2);
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_SSE2
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2
+                                          : ScaleARGBRowDown2Box_Any_SSE2);
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
-          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
-          ScaleARGBRowDown2Box_SSE2);
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_SSE2
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2
+                                            : ScaleARGBRowDown2Box_SSE2);
     }
   }
 #endif
 #if defined(HAS_SCALEARGBROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON :
-        (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON :
-        ScaleARGBRowDown2Box_Any_NEON);
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_NEON
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON
+                                          : ScaleARGBRowDown2Box_Any_NEON);
     if (IS_ALIGNED(dst_width, 8)) {
-      ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON :
-          (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON :
-          ScaleARGBRowDown2Box_NEON);
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_NEON
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON
+                                            : ScaleARGBRowDown2Box_NEON);
     }
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWN2_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDown2 =
+        filtering == kFilterNone
+            ? ScaleARGBRowDown2_Any_MSA
+            : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA
+                                          : ScaleARGBRowDown2Box_Any_MSA);
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDown2 =
+          filtering == kFilterNone
+              ? ScaleARGBRowDown2_MSA
+              : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA
+                                            : ScaleARGBRowDown2Box_MSA);
+    }
+  }
+#endif
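
The MSA block added here follows the file's standard two-step dispatch: TestCpuFlag selects the _Any_ wrapper, which is safe at every width, and IS_ALIGNED upgrades to the full-width kernel when dst_width is an exact multiple of the vector group. Reduced to a sketch; PickRowDown2 and its arguments are placeholders, not libyuv symbols:

    #include <cstddef>
    #include <cstdint>

    typedef void (*RowFn)(const uint8_t* src, ptrdiff_t stride,
                          uint8_t* dst, int dst_width);

    RowFn PickRowDown2(bool has_simd, int dst_width, RowFn c_ref,
                       RowFn any, RowFn full) {
      RowFn fn = c_ref;  // portable fallback
      if (has_simd) {
        fn = any;        // SIMD body plus C tail; safe for ragged widths
        if ((dst_width & 3) == 0) {
          fn = full;     // pure SIMD once width is a multiple of 4
        }
      }
      return fn;
    }
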
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -90,21 +125,32 @@
 // ScaleARGB ARGB, 1/4
 // This is an optimized version for scaling down an ARGB to 1/4 of
 // its original size.
-static void ScaleARGBDown4Box(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_argb, uint8* dst_argb,
-                              int x, int dx, int y, int dy) {
+static void ScaleARGBDown4Box(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              int x,
+                              int dx,
+                              int y,
+                              int dy) {
   int j;
   // Allocate 2 rows of ARGB.
   const int kRowSize = (dst_width * 2 * 4 + 31) & ~31;
   align_buffer_64(row, kRowSize * 2);
   int row_stride = src_stride * (dy >> 16);
-  void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
-    uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+  void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                            uint8_t* dst_argb, int dst_width) =
+      ScaleARGBRowDown2Box_C;
   // Advance to odd row, even column.
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
-  assert(dx == 65536 * 4);  // Test scale factor of 4.
+  (void)src_width;
+  (void)src_height;
+  (void)dx;
+  assert(dx == 65536 * 4);      // Test scale factor of 4.
   assert((dy & 0x3ffff) == 0);  // Test vertical scale is multiple of 4.
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
@@ -125,8 +171,8 @@
 
   for (j = 0; j < dst_height; ++j) {
     ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
-    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
-                      row + kRowSize, dst_width * 2);
+    ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize,
+                      dst_width * 2);
     ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
     src_argb += row_stride;
     dst_argb += dst_stride;
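
ScaleARGBDown4Box composes its 4x4 box from three 2x2 passes: two passes reduce a four-row source band into two temporary rows of dst_width * 2 pixels, and a third folds those into the final row. The temporary rows share one allocation whose per-row stride is (dst_width * 2 * 4 + 31) & ~31, i.e. rounded up to 32 bytes for aligned SIMD stores. Worked numbers:

    #include <cassert>

    int main() {
      // 100 output pixels: intermediate rows are 200 ARGB pixels each.
      assert(((100 * 2 * 4 + 31) & ~31) == 800);  // already 32-aligned
      assert(((101 * 2 * 4 + 31) & ~31) == 832);  // 808 rounds up to 832
      return 0;
    }
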
@@ -137,41 +183,60 @@
 // ScaleARGB ARGB Even
 // This is an optimized version for scaling down an ARGB image by an
 // even integer factor.
-static void ScaleARGBDownEven(int src_width, int src_height,
-                              int dst_width, int dst_height,
-                              int src_stride, int dst_stride,
-                              const uint8* src_argb, uint8* dst_argb,
-                              int x, int dx, int y, int dy,
+static void ScaleARGBDownEven(int src_width,
+                              int src_height,
+                              int dst_width,
+                              int dst_height,
+                              int src_stride,
+                              int dst_stride,
+                              const uint8_t* src_argb,
+                              uint8_t* dst_argb,
+                              int x,
+                              int dx,
+                              int y,
+                              int dy,
                               enum FilterMode filtering) {
   int j;
   int col_step = dx >> 16;
   int row_stride = (dy >> 16) * src_stride;
-  void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_step, uint8* dst_argb, int dst_width) =
+  void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride,
+                               int src_step, uint8_t* dst_argb, int dst_width) =
       filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+  (void)src_width;
+  (void)src_height;
   assert(IS_ALIGNED(src_width, 2));
   assert(IS_ALIGNED(src_height, 2));
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 :
-        ScaleARGBRowDownEven_Any_SSE2;
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2
+                                     : ScaleARGBRowDownEven_Any_SSE2;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
-          ScaleARGBRowDownEven_SSE2;
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2;
     }
   }
 #endif
 #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON :
-        ScaleARGBRowDownEven_Any_NEON;
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON
+                                     : ScaleARGBRowDownEven_Any_NEON;
     if (IS_ALIGNED(dst_width, 4)) {
-      ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
-          ScaleARGBRowDownEven_NEON;
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON;
     }
   }
 #endif
+#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA
+                                     : ScaleARGBRowDownEven_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBRowDownEven =
+          filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA;
+    }
+  }
+#endif
 
   if (filtering == kFilterLinear) {
     src_stride = 0;
@@ -184,25 +249,32 @@
 }
 
 // Scale ARGB down with bilinear interpolation.
-static void ScaleARGBBilinearDown(int src_width, int src_height,
-                                  int dst_width, int dst_height,
-                                  int src_stride, int dst_stride,
-                                  const uint8* src_argb, uint8* dst_argb,
-                                  int x, int dx, int y, int dy,
+static void ScaleARGBBilinearDown(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
+                                  int src_stride,
+                                  int dst_stride,
+                                  const uint8_t* src_argb,
+                                  uint8_t* dst_argb,
+                                  int x,
+                                  int dx,
+                                  int y,
+                                  int dy,
                                   enum FilterMode filtering) {
   int j;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
-  int64 xlast = x + (int64)(dst_width - 1) * dx;
-  int64 xl = (dx >= 0) ? x : xlast;
-  int64 xr = (dx >= 0) ? xlast : x;
+  int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
+  int64_t xl = (dx >= 0) ? x : xlast;
+  int64_t xr = (dx >= 0) ? xlast : x;
   int clip_src_width;
-  xl = (xl >> 16) & ~3;  // Left edge aligned.
-  xr = (xr >> 16) + 1;  // Right most pixel used.  Bilinear uses 2 pixels.
+  xl = (xl >> 16) & ~3;    // Left edge aligned.
+  xr = (xr >> 16) + 1;     // Rightmost pixel used.  Bilinear uses 2 pixels.
+  xr = (xr + 1 + 3) & ~3;  // 1 beyond 4-pixel-aligned rightmost pixel.
   if (xr > src_width) {
     xr = src_width;
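
Before dispatching, ScaleARGBBilinearDown clips the source read window: xlast is the fixed-point x of the last output pixel, the xl/xr selection keeps the span left-to-right even when dx is negative (mirrored scaling), and the edges snap outward to 4-pixel alignment with one extra pixel for the bilinear pair. Tracing x = 0, dx = 1.5 in 16.16, dst_width = 100:

    #include <cassert>
    #include <cstdint>

    int main() {
      int x = 0, dx = 98304 /* 1.5 */, dst_width = 100;
      int64_t xlast = x + (int64_t)(dst_width - 1) * dx;
      int64_t xl = (dx >= 0) ? x : xlast;
      int64_t xr = (dx >= 0) ? xlast : x;
      xl = (xl >> 16) & ~3;    // 0: left edge, aligned down
      xr = (xr >> 16) + 1;     // 149: last pixel plus its bilinear partner
      xr = (xr + 1 + 3) & ~3;  // 152: one past the aligned right edge
      assert(xr - xl == 152);  // source columns read per row (clip_src_width)
      return 0;
    }
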
@@ -234,12 +306,11 @@
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(clip_src_width, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(clip_src_width, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
     }
   }
 #endif
@@ -256,6 +327,14 @@
     }
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+    }
+  }
+#endif
   // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
   // Allocate a row of ARGB.
   {
@@ -267,7 +346,7 @@
     }
     for (j = 0; j < dst_height; ++j) {
       int yi = y >> 16;
-      const uint8* src = src_argb + yi * src_stride;
+      const uint8_t* src = src_argb + yi * src_stride;
       if (filtering == kFilterLinear) {
         ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
       } else {
@@ -286,18 +365,25 @@
 }
 
 // Scale ARGB up with bilinear interpolation.
-static void ScaleARGBBilinearUp(int src_width, int src_height,
-                                int dst_width, int dst_height,
-                                int src_stride, int dst_stride,
-                                const uint8* src_argb, uint8* dst_argb,
-                                int x, int dx, int y, int dy,
+static void ScaleARGBBilinearUp(int src_width,
+                                int src_height,
+                                int dst_width,
+                                int dst_height,
+                                int src_stride,
+                                int dst_stride,
+                                const uint8_t* src_argb,
+                                uint8_t* dst_argb,
+                                int x,
+                                int dx,
+                                int y,
+                                int dy,
                                 enum FilterMode filtering) {
   int j;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
+  void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
   const int max_y = (src_height - 1) << 16;
 #if defined(HAS_INTERPOLATEROW_SSSE3)
@@ -324,15 +410,17 @@
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
   }
 #endif
   if (src_width >= 32768) {
-    ScaleARGBFilterCols = filtering ?
-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+    ScaleARGBFilterCols =
+        filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
   }
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -347,6 +435,14 @@
     }
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+  if (filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -360,6 +456,14 @@
     }
   }
 #endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBCols_MSA;
+    }
+  }
+#endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
@@ -375,13 +479,13 @@
 
   {
     int yi = y >> 16;
-    const uint8* src = src_argb + yi * src_stride;
+    const uint8_t* src = src_argb + yi * src_stride;
 
     // Allocate 2 rows of ARGB.
     const int kRowSize = (dst_width * 4 + 31) & ~31;
     align_buffer_64(row, kRowSize * 2);
 
-    uint8* rowptr = row;
+    uint8_t* rowptr = row;
     int rowstride = kRowSize;
     int lasty = yi;
 
@@ -423,24 +527,27 @@
 
 #ifdef YUVSCALEUP
 // Scale YUV to ARGB up with bilinear interpolation.
-static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
-                                     int dst_width, int dst_height,
+static void ScaleYUVToARGBBilinearUp(int src_width,
+                                     int src_height,
+                                     int dst_width,
+                                     int dst_height,
                                      int src_stride_y,
                                      int src_stride_u,
                                      int src_stride_v,
                                      int dst_stride_argb,
-                                     const uint8* src_y,
-                                     const uint8* src_u,
-                                     const uint8* src_v,
-                                     uint8* dst_argb,
-                                     int x, int dx, int y, int dy,
+                                     const uint8_t* src_y,
+                                     const uint8_t* src_u,
+                                     const uint8_t* src_v,
+                                     uint8_t* dst_argb,
+                                     int x,
+                                     int dx,
+                                     int y,
+                                     int dy,
                                      enum FilterMode filtering) {
   int j;
-  void (*I422ToARGBRow)(const uint8* y_buf,
-                        const uint8* u_buf,
-                        const uint8* v_buf,
-                        uint8* rgb_buf,
-                        int width) = I422ToARGBRow_C;
+  void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf,
+                        const uint8_t* v_buf, uint8_t* rgb_buf, int width) =
+      I422ToARGBRow_C;
 #if defined(HAS_I422TOARGBROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
@@ -465,19 +572,18 @@
     }
   }
 #endif
-#if defined(HAS_I422TOARGBROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) &&
-      IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
-      IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
-      IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    I422ToARGBRow = I422ToARGBRow_DSPR2;
+#if defined(HAS_I422TOARGBROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToARGBRow = I422ToARGBRow_Any_MSA;
+    if (IS_ALIGNED(src_width, 8)) {
+      I422ToARGBRow = I422ToARGBRow_MSA;
+    }
   }
 #endif
 
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
 #if defined(HAS_INTERPOLATEROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3)) {
     InterpolateRow = InterpolateRow_Any_SSSE3;
@@ -502,19 +608,21 @@
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
-    InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      InterpolateRow = InterpolateRow_MSA;
+    }
   }
 #endif
 
-  void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                              int dst_width, int x, int dx) =
       filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
   if (src_width >= 32768) {
-    ScaleARGBFilterCols = filtering ?
-        ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+    ScaleARGBFilterCols =
+        filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
   }
 #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
   if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
@@ -529,6 +637,14 @@
     }
   }
 #endif
+#if defined(HAS_SCALEARGBFILTERCOLS_MSA)
+  if (filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 8)) {
+      ScaleARGBFilterCols = ScaleARGBFilterCols_MSA;
+    }
+  }
+#endif
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBFilterCols = ScaleARGBCols_SSE2;
@@ -542,6 +658,14 @@
     }
   }
 #endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (!filtering && TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBFilterCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBFilterCols = ScaleARGBCols_MSA;
+    }
+  }
+#endif
   if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBFilterCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
@@ -558,9 +682,9 @@
   const int kYShift = 1;  // Shift Y by 1 to convert Y plane to UV coordinate.
   int yi = y >> 16;
   int uv_yi = yi >> kYShift;
-  const uint8* src_row_y = src_y + yi * src_stride_y;
-  const uint8* src_row_u = src_u + uv_yi * src_stride_u;
-  const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+  const uint8_t* src_row_y = src_y + yi * src_stride_y;
+  const uint8_t* src_row_u = src_u + uv_yi * src_stride_u;
+  const uint8_t* src_row_v = src_v + uv_yi * src_stride_v;
 
   // Allocate 2 rows of ARGB.
   const int kRowSize = (dst_width * 4 + 31) & ~31;
@@ -569,7 +693,7 @@
   // Allocate 1 row of ARGB for source conversion.
   align_buffer_64(argb_row, src_width * 4);
 
-  uint8* rowptr = row;
+  uint8_t* rowptr = row;
   int rowstride = kRowSize;
   int lasty = yi;
 
@@ -635,15 +759,23 @@
 // of x and dx is the integer part of the source position and
 // the lower 16 bits are the fixed decimal part.
 
-static void ScaleARGBSimple(int src_width, int src_height,
-                            int dst_width, int dst_height,
-                            int src_stride, int dst_stride,
-                            const uint8* src_argb, uint8* dst_argb,
-                            int x, int dx, int y, int dy) {
+static void ScaleARGBSimple(int src_width,
+                            int src_height,
+                            int dst_width,
+                            int dst_height,
+                            int src_stride,
+                            int dst_stride,
+                            const uint8_t* src_argb,
+                            uint8_t* dst_argb,
+                            int x,
+                            int dx,
+                            int y,
+                            int dy) {
   int j;
-  void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
-      int dst_width, int x, int dx) =
+  void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb,
+                        int dst_width, int x, int dx) =
       (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+  (void)src_height;
 #if defined(HAS_SCALEARGBCOLS_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
     ScaleARGBCols = ScaleARGBCols_SSE2;
@@ -657,6 +789,14 @@
     }
   }
 #endif
+#if defined(HAS_SCALEARGBCOLS_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    ScaleARGBCols = ScaleARGBCols_Any_MSA;
+    if (IS_ALIGNED(dst_width, 4)) {
+      ScaleARGBCols = ScaleARGBCols_MSA;
+    }
+  }
+#endif
   if (src_width * 2 == dst_width && x < 0x8000) {
     ScaleARGBCols = ScaleARGBColsUp2_C;
 #if defined(HAS_SCALEARGBCOLSUP2_SSE2)
@@ -667,8 +807,8 @@
   }
 
   for (j = 0; j < dst_height; ++j) {
-    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
-                  dst_width, x, dx);
+    ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x,
+                  dx);
     dst_argb += dst_stride;
     y += dy;
   }
@@ -677,11 +817,18 @@
 // Scale an ARGB image.
 // This function in turn calls a scaling function
 // suitable for handling the desired resolutions.
-static void ScaleARGB(const uint8* src, int src_stride,
-                      int src_width, int src_height,
-                      uint8* dst, int dst_stride,
-                      int dst_width, int dst_height,
-                      int clip_x, int clip_y, int clip_width, int clip_height,
+static void ScaleARGB(const uint8_t* src,
+                      int src_stride,
+                      int src_width,
+                      int src_height,
+                      uint8_t* dst,
+                      int dst_stride,
+                      int dst_width,
+                      int dst_height,
+                      int clip_x,
+                      int clip_y,
+                      int clip_width,
+                      int clip_height,
                       enum FilterMode filtering) {
   // Initial source x/y coordinate and step values as 16.16 fixed point.
   int x = 0;
@@ -690,8 +837,7 @@
   int dy = 0;
   // ARGB does not support box filter yet, but allow the user to pass it.
   // Simplify filtering when possible.
-  filtering = ScaleFilterReduce(src_width, src_height,
-                                dst_width, dst_height,
+  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                 filtering);
 
   // Negative src_height means invert the image.
@@ -700,17 +846,17 @@
     src = src + (src_height - 1) * src_stride;
     src_stride = -src_stride;
   }
-  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
-             &x, &y, &dx, &dy);
+  ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y,
+             &dx, &dy);
   src_width = Abs(src_width);
   if (clip_x) {
-    int64 clipf = (int64)(clip_x) * dx;
+    int64_t clipf = (int64_t)(clip_x)*dx;
     x += (clipf & 0xffff);
     src += (clipf >> 16) * 4;
     dst += clip_x * 4;
   }
   if (clip_y) {
-    int64 clipf = (int64)(clip_y) * dy;
+    int64_t clipf = (int64_t)(clip_y)*dy;
     y += (clipf & 0xffff);
     src += (clipf >> 16) * src_stride;
     dst += clip_y * dst_stride;
@@ -725,24 +871,20 @@
       if (!(dx & 0x10000) && !(dy & 0x10000)) {
         if (dx == 0x20000) {
           // Optimized 1/2 downsample.
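           // dx == 0x20000 is exactly 2.0 in 16.16 fixed point: each output
           // pixel advances two source pixels.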
-          ScaleARGBDown2(src_width, src_height,
-                         clip_width, clip_height,
-                         src_stride, dst_stride, src, dst,
-                         x, dx, y, dy, filtering);
+          ScaleARGBDown2(src_width, src_height, clip_width, clip_height,
+                         src_stride, dst_stride, src, dst, x, dx, y, dy,
+                         filtering);
           return;
         }
         if (dx == 0x40000 && filtering == kFilterBox) {
           // Optimized 1/4 box downsample.
-          ScaleARGBDown4Box(src_width, src_height,
-                            clip_width, clip_height,
-                            src_stride, dst_stride, src, dst,
-                            x, dx, y, dy);
+          ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height,
+                            src_stride, dst_stride, src, dst, x, dx, y, dy);
           return;
         }
-        ScaleARGBDownEven(src_width, src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src, dst,
-                          x, dx, y, dy, filtering);
+        ScaleARGBDownEven(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy,
+                          filtering);
         return;
       }
       // Optimized odd scale down, i.e. 3, 5, 7, 9x.
@@ -759,96 +901,105 @@
   }
   if (dx == 0x10000 && (x & 0xffff) == 0) {
     // Arbitrary scale vertically, but unscaled horizontally.
-    ScalePlaneVertical(src_height,
-                       clip_width, clip_height,
-                       src_stride, dst_stride, src, dst,
-                       x, y, dy, 4, filtering);
+    ScalePlaneVertical(src_height, clip_width, clip_height, src_stride,
+                       dst_stride, src, dst, x, y, dy, 4, filtering);
     return;
   }
   if (filtering && dy < 65536) {
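     // dy < 0x10000 (1.0 in 16.16) means the image is enlarged vertically.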
-    ScaleARGBBilinearUp(src_width, src_height,
-                        clip_width, clip_height,
-                        src_stride, dst_stride, src, dst,
-                        x, dx, y, dy, filtering);
+    ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height,
+                        src_stride, dst_stride, src, dst, x, dx, y, dy,
+                        filtering);
     return;
   }
   if (filtering) {
-    ScaleARGBBilinearDown(src_width, src_height,
-                          clip_width, clip_height,
-                          src_stride, dst_stride, src, dst,
-                          x, dx, y, dy, filtering);
+    ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height,
+                          src_stride, dst_stride, src, dst, x, dx, y, dy,
+                          filtering);
     return;
   }
-  ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
-                  src_stride, dst_stride, src, dst,
-                  x, dx, y, dy);
+  ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride,
+                  dst_stride, src, dst, x, dx, y, dy);
 }
 
 LIBYUV_API
-int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
-                  int src_width, int src_height,
-                  uint8* dst_argb, int dst_stride_argb,
-                  int dst_width, int dst_height,
-                  int clip_x, int clip_y, int clip_width, int clip_height,
+int ARGBScaleClip(const uint8_t* src_argb,
+                  int src_stride_argb,
+                  int src_width,
+                  int src_height,
+                  uint8_t* dst_argb,
+                  int dst_stride_argb,
+                  int dst_width,
+                  int dst_height,
+                  int clip_x,
+                  int clip_y,
+                  int clip_width,
+                  int clip_height,
                   enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0 ||
-      clip_x < 0 || clip_y < 0 ||
+  if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb ||
+      dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 ||
       clip_width > 32768 || clip_height > 32768 ||
       (clip_x + clip_width) > dst_width ||
       (clip_y + clip_height) > dst_height) {
     return -1;
   }
-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
-            dst_argb, dst_stride_argb, dst_width, dst_height,
-            clip_x, clip_y, clip_width, clip_height, filtering);
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+            dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width,
+            clip_height, filtering);
   return 0;
 }
 
 // Scale an ARGB image.
 LIBYUV_API
-int ARGBScale(const uint8* src_argb, int src_stride_argb,
-              int src_width, int src_height,
-              uint8* dst_argb, int dst_stride_argb,
-              int dst_width, int dst_height,
+int ARGBScale(const uint8_t* src_argb,
+              int src_stride_argb,
+              int src_width,
+              int src_height,
+              uint8_t* dst_argb,
+              int dst_stride_argb,
+              int dst_width,
+              int dst_height,
               enum FilterMode filtering) {
-  if (!src_argb || src_width == 0 || src_height == 0 ||
-      src_width > 32768 || src_height > 32768 ||
-      !dst_argb || dst_width <= 0 || dst_height <= 0) {
+  if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 ||
+      src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) {
     return -1;
   }
-  ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
-            dst_argb, dst_stride_argb, dst_width, dst_height,
-            0, 0, dst_width, dst_height, filtering);
+  ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb,
+            dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height,
+            filtering);
   return 0;
 }
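 // Usage sketch, assuming caller-allocated src/dst buffers: halving a
 // 640x360 ARGB image would be
 //   ARGBScale(src, 640 * 4, 640, 360, dst, 320 * 4, 320, 180,
 //             kFilterBilinear);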
 
 // Scale with YUV conversion to ARGB and clipping.
 LIBYUV_API
-int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
-                       const uint8* src_u, int src_stride_u,
-                       const uint8* src_v, int src_stride_v,
-                       uint32 src_fourcc,
-                       int src_width, int src_height,
-                       uint8* dst_argb, int dst_stride_argb,
-                       uint32 dst_fourcc,
-                       int dst_width, int dst_height,
-                       int clip_x, int clip_y, int clip_width, int clip_height,
+int YUVToARGBScaleClip(const uint8_t* src_y,
+                       int src_stride_y,
+                       const uint8_t* src_u,
+                       int src_stride_u,
+                       const uint8_t* src_v,
+                       int src_stride_v,
+                       uint32_t src_fourcc,
+                       int src_width,
+                       int src_height,
+                       uint8_t* dst_argb,
+                       int dst_stride_argb,
+                       uint32_t dst_fourcc,
+                       int dst_width,
+                       int dst_height,
+                       int clip_x,
+                       int clip_y,
+                       int clip_width,
+                       int clip_height,
                        enum FilterMode filtering) {
-  uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4);
+  uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4);
   int r;
-  I420ToARGB(src_y, src_stride_y,
-             src_u, src_stride_u,
-             src_v, src_stride_v,
-             argb_buffer, src_width * 4,
-             src_width, src_height);
+  (void)src_fourcc;  // TODO(fbarchard): implement and/or assert.
+  (void)dst_fourcc;
+  if (!argb_buffer) {
+    return 1;  // Out of memory: the ARGB work buffer could not be allocated.
+  }
+  I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
+             argb_buffer, src_width * 4, src_width, src_height);
 
-  r = ARGBScaleClip(argb_buffer, src_width * 4,
-                    src_width, src_height,
-                    dst_argb, dst_stride_argb,
-                    dst_width, dst_height,
-                    clip_x, clip_y, clip_width, clip_height,
-                    filtering);
+  r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb,
+                    dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
+                    clip_width, clip_height, filtering);
   free(argb_buffer);
   return r;
 }
--- a/third_party/libyuv/source/scale_common.cc
+++ b/third_party/libyuv/source/scale_common.cc
@@ -28,9 +28,12 @@
 }
 
 // CPU agnostic row functions
-void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width) {
+void ScaleRowDown2_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[1];
     dst[1] = src_ptr[3];
@@ -42,9 +45,12 @@
   }
 }
 
-void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width) {
+void ScaleRowDown2_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[1];
     dst[1] = src_ptr[3];
@@ -56,10 +62,13 @@
   }
 }
 
-void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
+void ScaleRowDown2Linear_C(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width) {
+  const uint8_t* s = src_ptr;
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + 1) >> 1;
     dst[1] = (s[2] + s[3] + 1) >> 1;
@@ -71,10 +80,13 @@
   }
 }
 
-void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                              uint16* dst, int dst_width) {
-  const uint16* s = src_ptr;
+void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16_t* dst,
+                              int dst_width) {
+  const uint16_t* s = src_ptr;
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + 1) >> 1;
     dst[1] = (s[2] + s[3] + 1) >> 1;
@@ -86,10 +98,12 @@
   }
 }
 
-void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
+void ScaleRowDown2Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
@@ -103,10 +117,12 @@
   }
 }
 
-void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
+void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
   int x;
   dst_width -= 1;
   for (x = 0; x < dst_width - 1; x += 2) {
@@ -125,10 +141,12 @@
   dst[0] = (s[0] + t[0] + 1) >> 1;
 }
 
-void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
+void ScaleRowDown2Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width) {
+  const uint16_t* s = src_ptr;
+  const uint16_t* t = src_ptr + src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
@@ -142,9 +160,12 @@
   }
 }
 
-void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                     uint8* dst, int dst_width) {
+void ScaleRowDown4_C(const uint8_t* src_ptr,
+                     ptrdiff_t src_stride,
+                     uint8_t* dst,
+                     int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[2];
     dst[1] = src_ptr[6];
@@ -156,9 +177,12 @@
   }
 }
 
-void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                        uint16* dst, int dst_width) {
+void ScaleRowDown4_16_C(const uint16_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint16_t* dst,
+                        int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src_ptr[2];
     dst[1] = src_ptr[6];
@@ -170,81 +194,88 @@
   }
 }
 
-void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
+void ScaleRowDown4Box_C(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
   intptr_t stride = src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
     dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-             src_ptr[stride + 4] + src_ptr[stride + 5] +
-             src_ptr[stride + 6] + src_ptr[stride + 7] +
-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
-             8) >> 4;
+              src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+              src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+              src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+              src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+              src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+              src_ptr[stride * 3 + 7] + 8) >>
+             4;
     dst += 2;
     src_ptr += 8;
   }
   if (dst_width & 1) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
   }
 }
 
-void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                           uint16* dst, int dst_width) {
+void ScaleRowDown4Box_16_C(const uint16_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint16_t* dst,
+                           int dst_width) {
   intptr_t stride = src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
     dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
-             src_ptr[stride + 4] + src_ptr[stride + 5] +
-             src_ptr[stride + 6] + src_ptr[stride + 7] +
-             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
-             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
-             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
-             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
-             8) >> 4;
+              src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] +
+              src_ptr[stride + 7] + src_ptr[stride * 2 + 4] +
+              src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] +
+              src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] +
+              src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] +
+              src_ptr[stride * 3 + 7] + 8) >>
+             4;
     dst += 2;
     src_ptr += 8;
   }
   if (dst_width & 1) {
     dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
-             src_ptr[stride + 0] + src_ptr[stride + 1] +
-             src_ptr[stride + 2] + src_ptr[stride + 3] +
-             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
-             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
-             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
-             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
-             8) >> 4;
+              src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] +
+              src_ptr[stride + 3] + src_ptr[stride * 2 + 0] +
+              src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] +
+              src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] +
+              src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] +
+              src_ptr[stride * 3 + 3] + 8) >>
+             4;
   }
 }
 
-void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width) {
+void ScaleRowDown34_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width) {
   int x;
+  (void)src_stride;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -255,9 +286,12 @@
   }
 }
 
-void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width) {
+void ScaleRowDown34_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width) {
   int x;
+  (void)src_stride;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -269,19 +303,21 @@
 }
 
 // Filter rows 0 and 1 together, 3 : 1
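 // e.g. a0 = (3 * s[0] + 1 * s[1] + 2) >> 2 is a 3:1 weighted average of two
 // neighbors, with the +2 rounding to nearest before the shift.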
-void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
+void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 * 3 + b0 + 2) >> 2;
     d[1] = (a1 * 3 + b1 + 2) >> 2;
     d[2] = (a2 * 3 + b2 + 2) >> 2;
@@ -291,19 +327,21 @@
   }
 }
 
-void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
+void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width) {
+  const uint16_t* s = src_ptr;
+  const uint16_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 * 3 + b0 + 2) >> 2;
     d[1] = (a1 * 3 + b1 + 2) >> 2;
     d[2] = (a2 * 3 + b2 + 2) >> 2;
@@ -314,19 +352,21 @@
 }
 
 // Filter rows 1 and 2 together, 1 : 1
-void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* d, int dst_width) {
-  const uint8* s = src_ptr;
-  const uint8* t = src_ptr + src_stride;
+void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* d,
+                            int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 + b0 + 1) >> 1;
     d[1] = (a1 + b1 + 1) >> 1;
     d[2] = (a2 + b2 + 1) >> 1;
@@ -336,19 +376,21 @@
   }
 }
 
-void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* d, int dst_width) {
-  const uint16* s = src_ptr;
-  const uint16* t = src_ptr + src_stride;
+void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* d,
+                               int dst_width) {
+  const uint16_t* s = src_ptr;
+  const uint16_t* t = src_ptr + src_stride;
   int x;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (x = 0; x < dst_width; x += 3) {
-    uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
-    uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
-    uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
-    uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
-    uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
-    uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
+    uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
+    uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
+    uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
+    uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
+    uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
+    uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
     d[0] = (a0 + b0 + 1) >> 1;
     d[1] = (a1 + b1 + 1) >> 1;
     d[2] = (a2 + b2 + 1) >> 1;
@@ -359,8 +401,11 @@
 }
 
 // Scales a single row of pixels using point sampling.
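 // The integer part of the 16.16 accumulator x selects the source pixel;
 // the fractional bits are simply discarded.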
-void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                 int dst_width, int x, int dx) {
+void ScaleCols_C(uint8_t* dst_ptr,
+                 const uint8_t* src_ptr,
+                 int dst_width,
+                 int x,
+                 int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[0] = src_ptr[x >> 16];
@@ -374,8 +419,11 @@
   }
 }
 
-void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                    int dst_width, int x, int dx) {
+void ScaleCols_16_C(uint16_t* dst_ptr,
+                    const uint16_t* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[0] = src_ptr[x >> 16];
@@ -390,9 +438,14 @@
 }
 
 // Scales a single row of pixels up by 2x using point sampling.
-void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr,
-                    int dst_width, int x, int dx) {
+void ScaleColsUp2_C(uint8_t* dst_ptr,
+                    const uint8_t* src_ptr,
+                    int dst_width,
+                    int x,
+                    int dx) {
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[1] = dst_ptr[0] = src_ptr[0];
     src_ptr += 1;
@@ -403,9 +456,14 @@
   }
 }
 
-void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleColsUp2_16_C(uint16_t* dst_ptr,
+                       const uint16_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst_ptr[1] = dst_ptr[0] = src_ptr[0];
     src_ptr += 1;
@@ -418,16 +476,19 @@
 
 // (1-f)a + fb can be replaced with a + f(b-a)
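 // since (1-f)*a + f*b == a - f*a + f*b == a + f*(b-a), saving one multiply.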
 #if defined(__arm__) || defined(__aarch64__)
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
-    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+#define BLENDER(a, b, f) \
+  (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 #else
-// inteluses 7 bit math with rounding.
-#define BLENDER(a, b, f) (uint8)((int)(a) + \
-    (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
+// Intel uses 7 bit math with rounding.
+#define BLENDER(a, b, f) \
+  (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7))
 #endif
 
-void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleFilterCols_C(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
@@ -450,12 +511,15 @@
   }
 }
 
-void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr,
-                         int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
+void ScaleFilterCols64_C(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x32,
+                         int dx) {
+  int64_t x = (int64_t)(x32);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -468,7 +532,7 @@
     dst_ptr += 2;
   }
   if (dst_width & 1) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -476,12 +540,15 @@
 }
 #undef BLENDER
 
-// Same as 8 bit arm blender but return is cast to uint16
-#define BLENDER(a, b, f) (uint16)((int)(a) + \
-    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
+// Same as 8 bit arm blender but return is cast to uint16_t
+#define BLENDER(a, b, f) \
+  (uint16_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
 
-void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                       int dst_width, int x, int dx) {
+void ScaleFilterCols_16_C(uint16_t* dst_ptr,
+                          const uint16_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
@@ -504,12 +571,15 @@
   }
 }
 
-void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr,
-                         int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
+void ScaleFilterCols64_16_C(uint16_t* dst_ptr,
+                            const uint16_t* src_ptr,
+                            int dst_width,
+                            int x32,
+                            int dx) {
+  int64_t x = (int64_t)(x32);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -522,7 +592,7 @@
     dst_ptr += 2;
   }
   if (dst_width & 1) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int a = src_ptr[xi];
     int b = src_ptr[xi + 1];
     dst_ptr[0] = BLENDER(a, b, x & 0xffff);
@@ -530,9 +600,12 @@
 }
 #undef BLENDER
 
-void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                      uint8* dst, int dst_width) {
+void ScaleRowDown38_C(const uint8_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint8_t* dst,
+                      int dst_width) {
   int x;
+  (void)src_stride;
   assert(dst_width % 3 == 0);
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -543,9 +616,12 @@
   }
 }
 
-void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                         uint16* dst, int dst_width) {
+void ScaleRowDown38_16_C(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width) {
   int x;
+  (void)src_stride;
   assert(dst_width % 3 == 0);
   for (x = 0; x < dst_width; x += 3) {
     dst[0] = src_ptr[0];
@@ -557,53 +633,61 @@
 }
 
 // 8x3 -> 3x1
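 // Division by 9 (and by 6) is done in fixed point: n * (65536 / 9) >> 16
 // approximates n / 9 without an integer divide.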
-void ScaleRowDown38_3_Box_C(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+                            uint8_t* dst_ptr,
+                            int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7] +
-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
-        (65536 / 6) >> 16;
+    dst_ptr[0] =
+        (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+         src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+         src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[1] =
+        (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+         src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+         src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+         src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+            (65536 / 6) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
-void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr,
+void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width) {
+                               uint16_t* dst_ptr,
+                               int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
-        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
-        (65536 / 9) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
-        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
-        (65536 / 9) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7] +
-        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
-        (65536 / 6) >> 16;
+    dst_ptr[0] =
+        (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+         src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
+         src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[1] =
+        (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+         src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
+         src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
+            (65536 / 9) >>
+        16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] +
+         src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
+            (65536 / 6) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
@@ -610,47 +694,57 @@
 }
 
 // 8x2 -> 3x1
-void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
-        (65536 / 4) >> 16;
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+                  src_ptr[stride + 1] + src_ptr[stride + 2]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+                  src_ptr[stride + 4] + src_ptr[stride + 5]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+            (65536 / 4) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
-void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride,
-                               uint16* dst_ptr, int dst_width) {
+void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint16_t* dst_ptr,
+                               int dst_width) {
   intptr_t stride = src_stride;
   int i;
   assert((dst_width % 3 == 0) && (dst_width > 0));
   for (i = 0; i < dst_width; i += 3) {
-    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
-        src_ptr[stride + 0] + src_ptr[stride + 1] +
-        src_ptr[stride + 2]) * (65536 / 6) >> 16;
-    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
-        src_ptr[stride + 3] + src_ptr[stride + 4] +
-        src_ptr[stride + 5]) * (65536 / 6) >> 16;
-    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
-        src_ptr[stride + 6] + src_ptr[stride + 7]) *
-        (65536 / 4) >> 16;
+    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] +
+                  src_ptr[stride + 1] + src_ptr[stride + 2]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] +
+                  src_ptr[stride + 4] + src_ptr[stride + 5]) *
+                     (65536 / 6) >>
+                 16;
+    dst_ptr[2] =
+        (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) *
+            (65536 / 4) >>
+        16;
     src_ptr += 8;
     dst_ptr += 3;
   }
 }
 
-void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
   int x;
   assert(src_width > 0);
   for (x = 0; x < src_width - 1; x += 2) {
@@ -664,7 +758,9 @@
   }
 }
 
-void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) {
+void ScaleAddRow_16_C(const uint16_t* src_ptr,
+                      uint32_t* dst_ptr,
+                      int src_width) {
   int x;
   assert(src_width > 0);
   for (x = 0; x < src_width - 1; x += 2) {
@@ -678,13 +774,14 @@
   }
 }
 
-void ScaleARGBRowDown2_C(const uint8* src_argb,
+void ScaleARGBRowDown2_C(const uint8_t* src_argb,
                          ptrdiff_t src_stride,
-                         uint8* dst_argb, int dst_width) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-
+                         uint8_t* dst_argb,
+                         int dst_width) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[1];
     dst[1] = src[3];
@@ -696,10 +793,12 @@
   }
 }
 
-void ScaleARGBRowDown2Linear_C(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
+                               uint8_t* dst_argb,
+                               int dst_width) {
   int x;
+  (void)src_stride;
   for (x = 0; x < dst_width; ++x) {
     dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1;
     dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1;
@@ -710,29 +809,37 @@
   }
 }
 
-void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDown2Box_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_argb,
+                            int dst_width) {
   int x;
   for (x = 0; x < dst_width; ++x) {
-    dst_argb[0] = (src_argb[0] + src_argb[4] +
-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
-    dst_argb[1] = (src_argb[1] + src_argb[5] +
-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
-    dst_argb[2] = (src_argb[2] + src_argb[6] +
-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
-    dst_argb[3] = (src_argb[3] + src_argb[7] +
-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+                   src_argb[src_stride + 4] + 2) >>
+                  2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+                   src_argb[src_stride + 5] + 2) >>
+                  2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+                   src_argb[src_stride + 6] + 2) >>
+                  2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+                   src_argb[src_stride + 7] + 2) >>
+                  2;
     src_argb += 8;
     dst_argb += 4;
   }
 }
 
-void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEven_C(const uint8_t* src_argb,
+                            ptrdiff_t src_stride,
                             int src_stepx,
-                            uint8* dst_argb, int dst_width) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
-
+                            uint8_t* dst_argb,
+                            int dst_width) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
+  (void)src_stride;
   int x;
   for (x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[0];
@@ -745,20 +852,25 @@
   }
 }
 
-void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
+void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
                                int src_stepx,
-                               uint8* dst_argb, int dst_width) {
+                               uint8_t* dst_argb,
+                               int dst_width) {
   int x;
   for (x = 0; x < dst_width; ++x) {
-    dst_argb[0] = (src_argb[0] + src_argb[4] +
-                  src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2;
-    dst_argb[1] = (src_argb[1] + src_argb[5] +
-                  src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2;
-    dst_argb[2] = (src_argb[2] + src_argb[6] +
-                  src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2;
-    dst_argb[3] = (src_argb[3] + src_argb[7] +
-                  src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2;
+    dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] +
+                   src_argb[src_stride + 4] + 2) >>
+                  2;
+    dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] +
+                   src_argb[src_stride + 5] + 2) >>
+                  2;
+    dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] +
+                   src_argb[src_stride + 6] + 2) >>
+                  2;
+    dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] +
+                   src_argb[src_stride + 7] + 2) >>
+                  2;
     src_argb += src_stepx * 4;
     dst_argb += 4;
   }
@@ -765,10 +877,13 @@
 }
 
 // Scales a single row of pixels using point sampling.
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb,
-                     int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBCols_C(uint8_t* dst_argb,
+                     const uint8_t* src_argb,
+                     int dst_width,
+                     int x,
+                     int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst[0] = src[x >> 16];
@@ -782,11 +897,14 @@
   }
 }
 
-void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb,
-                       int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBCols64_C(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x32,
+                       int dx) {
+  int64_t x = (int64_t)(x32);
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst[0] = src[x >> 16];
@@ -801,11 +919,16 @@
 }
 
 // Scales a single row of pixels up by 2x using point sampling.
-void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBColsUp2_C(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
+  (void)x;
+  (void)dx;
   for (j = 0; j < dst_width - 1; j += 2) {
     dst[1] = dst[0] = src[0];
     src += 1;
@@ -818,23 +941,26 @@
 
 // TODO(fbarchard): Replace 0x7f ^ f with 128-f.  bug=607.
 // Mimics SSSE3 blender
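 // Note: 0x7f ^ f equals 127 - f for a 7-bit f, so the two blend weights sum
 // to 127 rather than 128, which appears to be what the TODO above is about.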
-#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7
-#define BLENDERC(a, b, f, s) (uint32)( \
-    BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
-#define BLENDER(a, b, f) \
-    BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \
-    BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0)
+#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7
+#define BLENDERC(a, b, f, s) \
+  (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s)
+#define BLENDER(a, b, f)                                                 \
+  BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \
+      BLENDERC(a, b, f, 0)
 
-void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBFilterCols_C(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
     int xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
     x += dx;
     xi = x >> 16;
@@ -848,23 +974,26 @@
   if (dst_width & 1) {
     int xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
   }
 }
 
-void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb,
-                             int dst_width, int x32, int dx) {
-  int64 x = (int64)(x32);
-  const uint32* src = (const uint32*)(src_argb);
-  uint32* dst = (uint32*)(dst_argb);
+void ScaleARGBFilterCols64_C(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x32,
+                             int dx) {
+  int64_t x = (int64_t)(x32);
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
   int j;
   for (j = 0; j < dst_width - 1; j += 2) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
     x += dx;
     xi = x >> 16;
@@ -876,10 +1005,10 @@
     dst += 2;
   }
   if (dst_width & 1) {
-    int64 xi = x >> 16;
+    int64_t xi = x >> 16;
     int xf = (x >> 9) & 0x7f;
-    uint32 a = src[xi];
-    uint32 b = src[xi + 1];
+    uint32_t a = src[xi];
+    uint32_t b = src[xi + 1];
     dst[0] = BLENDER(a, b, xf);
   }
 }
@@ -889,16 +1018,22 @@
 
 // Scale plane vertically with bilinear interpolation.
 void ScalePlaneVertical(int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_argb, uint8* dst_argb,
-                        int x, int y, int dy,
-                        int bpp, enum FilterMode filtering) {
+                        int dst_width,
+                        int dst_height,
+                        int src_stride,
+                        int dst_stride,
+                        const uint8_t* src_argb,
+                        uint8_t* dst_argb,
+                        int x,
+                        int y,
+                        int dy,
+                        int bpp,
+                        enum FilterMode filtering) {
   // TODO(fbarchard): Allow higher bpp.
   int dst_width_bytes = dst_width * bpp;
-  void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_C;
+  void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_C;
   const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
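   // Clamping y to max_y keeps the second row read by the interpolator
   // inside the source image.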
   int j;
   assert(bpp >= 1 && bpp <= 4);
@@ -930,13 +1065,11 @@
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_DSPR2;
-    if (IS_ALIGNED(dst_width_bytes, 4)) {
-      InterpolateRow = InterpolateRow_DSPR2;
+#if defined(HAS_INTERPOLATEROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    InterpolateRow = InterpolateRow_Any_MSA;
+    if (IS_ALIGNED(dst_width_bytes, 32)) {
+      InterpolateRow = InterpolateRow_MSA;
     }
   }
 #endif
@@ -948,23 +1081,29 @@
     }
     yi = y >> 16;
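     // The top byte of y's 16-bit fraction becomes an 8-bit blend weight
     // between source rows yi and yi + 1.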
     yf = filtering ? ((y >> 8) & 255) : 0;
-    InterpolateRow(dst_argb, src_argb + yi * src_stride,
-                   src_stride, dst_width_bytes, yf);
+    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+                   dst_width_bytes, yf);
     dst_argb += dst_stride;
     y += dy;
   }
 }
 void ScalePlaneVertical_16(int src_height,
-                           int dst_width, int dst_height,
-                           int src_stride, int dst_stride,
-                           const uint16* src_argb, uint16* dst_argb,
-                           int x, int y, int dy,
-                           int wpp, enum FilterMode filtering) {
+                           int dst_width,
+                           int dst_height,
+                           int src_stride,
+                           int dst_stride,
+                           const uint16_t* src_argb,
+                           uint16_t* dst_argb,
+                           int x,
+                           int y,
+                           int dy,
+                           int wpp,
+                           enum FilterMode filtering) {
   // TODO(fbarchard): Allow higher wpp.
   int dst_width_words = dst_width * wpp;
-  void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb,
-      ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
-      InterpolateRow_16_C;
+  void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb,
+                         ptrdiff_t src_stride, int dst_width,
+                         int source_y_fraction) = InterpolateRow_16_C;
   const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
   int j;
   assert(wpp >= 1 && wpp <= 2);
@@ -1004,16 +1143,6 @@
     }
   }
 #endif
-#if defined(HAS_INTERPOLATEROW_16_DSPR2)
-  if (TestCpuFlag(kCpuHasDSPR2) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
-    InterpolateRow = InterpolateRow_Any_16_DSPR2;
-    if (IS_ALIGNED(dst_width_bytes, 4)) {
-      InterpolateRow = InterpolateRow_16_DSPR2;
-    }
-  }
-#endif
   for (j = 0; j < dst_height; ++j) {
     int yi;
     int yf;
@@ -1022,8 +1151,8 @@
     }
     yi = y >> 16;
     yf = filtering ? ((y >> 8) & 255) : 0;
-    InterpolateRow(dst_argb, src_argb + yi * src_stride,
-                   src_stride, dst_width_words, yf);
+    InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride,
+                   dst_width_words, yf);
     dst_argb += dst_stride;
     y += dy;
   }
@@ -1030,8 +1159,10 @@
 }
 
 // Simplify the filtering based on scale factors.
-enum FilterMode ScaleFilterReduce(int src_width, int src_height,
-                                  int dst_width, int dst_height,
+enum FilterMode ScaleFilterReduce(int src_width,
+                                  int src_height,
+                                  int dst_width,
+                                  int dst_height,
                                   enum FilterMode filtering) {
   if (src_width < 0) {
     src_width = -src_width;
@@ -1073,22 +1204,26 @@
 
 // Divide num by div and return as 16.16 fixed point result.
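 // e.g. FixedDiv_C(1, 2) returns 0x8000, i.e. 0.5 in 16.16 fixed point.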
 int FixedDiv_C(int num, int div) {
-  return (int)(((int64)(num) << 16) / div);
+  return (int)(((int64_t)(num) << 16) / div);
 }
 
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv1_C(int num, int div) {
-  return (int)((((int64)(num) << 16) - 0x00010001) /
-                          (div - 1));
+  return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1));
 }
 
 #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
 
 // Compute slope values for stepping.
-void ScaleSlope(int src_width, int src_height,
-                int dst_width, int dst_height,
+void ScaleSlope(int src_width,
+                int src_height,
+                int dst_width,
+                int dst_height,
                 enum FilterMode filtering,
-                int* x, int* y, int* dx, int* dy) {
+                int* x,
+                int* y,
+                int* dx,
+                int* dy) {
   assert(x != NULL);
   assert(y != NULL);
   assert(dx != NULL);
@@ -1120,7 +1255,7 @@
       *x = 0;
     }
     if (dst_height <= src_height) {
-      *dy = FixedDiv(src_height,  dst_height);
+      *dy = FixedDiv(src_height, dst_height);
       *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
     } else if (dst_height > 1) {
       *dy = FixedDiv1(src_height, dst_height);
@@ -1152,6 +1287,35 @@
   }
 }
 #undef CENTERSTART
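
To make the centering concrete: for a 2x downscale, FixedDiv yields a step of 2.0 and CENTERSTART shifts the start by half a step minus half a pixel (the -32768), so sampling begins at source coordinate 0.5. A standalone check (not from the patch):

    #include <assert.h>

    int main(void) {
      /* Hypothetical 2x vertical downscale, worked in 16.16 fixed point. */
      int dy = 2 << 16;              /* FixedDiv(1080, 540) == 2.0 */
      int y = (dy >> 1) + (-32768);  /* CENTERSTART(dy, -32768)    */
      assert(y == 32768);            /* start at source row 0.5    */
      return 0;
    }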
+
+// Read an 8x2 block, upsample with bilinear filtering, and write 16x1.
+// Actually reads an extra source pixel, so 9x2.
+void ScaleRowUp2_16_C(const uint16_t* src_ptr,
+                      ptrdiff_t src_stride,
+                      uint16_t* dst,
+                      int dst_width) {
+  const uint16_t* src2 = src_ptr + src_stride;
+
+  int x;
+  for (x = 0; x < dst_width - 1; x += 2) {
+    uint16_t p0 = src_ptr[0];
+    uint16_t p1 = src_ptr[1];
+    uint16_t p2 = src2[0];
+    uint16_t p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+    dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4;
+    ++src_ptr;
+    ++src2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    uint16_t p0 = src_ptr[0];
+    uint16_t p1 = src_ptr[1];
+    uint16_t p2 = src2[0];
+    uint16_t p3 = src2[1];
+    dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4;
+  }
+}
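
The 9/3/3/1 taps are bilinear weights at a quarter-pixel offset, (3/4)(3/4), (1/4)(3/4), (3/4)(1/4) and (1/4)(1/4) scaled by 16, with +8 for rounding before the shift; a standalone check (not from the patch):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const uint16_t p0 = 100, p1 = 200, p2 = 120, p3 = 240;
      /* (9*100 + 3*200 + 3*120 + 240 + 8) >> 4 == 2108 >> 4 == 131 */
      assert(((p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4) == 131);
      return 0;
    }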
 
 #ifdef __cplusplus
 }  // extern "C"
--- a/third_party/libyuv/source/scale_gcc.cc
+++ b/third_party/libyuv/source/scale_gcc.cc
@@ -21,1296 +21,1348 @@
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
 
 // Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
 
 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
+                              8, 9, 9, 10, 10, 11, 12, 13};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
+                              10, 11, 12, 13, 13, 14, 14, 15};
 
 // Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
 
 // Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
 
 // Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
 
 // Rounding constant for the 3/4 box filters.
-static vec16 kRound34 =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
 
-static uvec8 kShuf38a =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
+                               128, 128, 128, 128, 128, 128, 128, 128};
 
-static uvec8 kShuf38b =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
+                               6,   8,   11,  14,  128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
+                              128, 128, 128, 128, 128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
+                               6,   7,   12,  13,  128, 128, 128, 128};
 
 // Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+                                  65536 / 9, 65536 / 6, 0,         0};
 
 // Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
+                               11, 128, 14, 128, 128, 128, 128, 128};
 
 // Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
+                               12, 128, 15, 128, 128, 128, 128, 128};
 
 // Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
+                               13, 128, 128, 128, 128, 128, 128, 128};
 
 // Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+                                 65536 / 3, 65536 / 2, 0,         0};
 
 // GCC versions of row functions are verbatim conversions from Visual C.
 // Generated using gcc disassembly on Visual C object file:
 // objdump -D yuvscaler.obj >yuvscaler.txt
 
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "psrlw     $0x8,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "psrlw     $0x8,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
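
Here psrlw $0x8 shifts each 16-bit lane down to its high byte, i.e. every odd-indexed source pixel, and packuswb repacks 16 of them per iteration; a scalar sketch of the same computation (hypothetical helper name, not from the patch):

    #include <stdint.h>

    /* Scalar equivalent of the SSSE3 loop above. */
    static void ScaleRowDown2_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                     int dst_width) {
      int x;
      for (x = 0; x < dst_width; ++x) {
        dst_ptr[x] = src_ptr[x * 2 + 1];  /* high byte of each word */
      }
    }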
 
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrlw      $0xf,%%xmm4                    \n"
-    "packuswb   %%xmm4,%%xmm4                  \n"
-    "pxor       %%xmm5,%%xmm5                  \n"
+void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrlw      $0xf,%%xmm4                    \n"
+      "packuswb   %%xmm4,%%xmm4                  \n"
+      "pxor       %%xmm5,%%xmm5                  \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pmaddubsw  %%xmm4,%%xmm0                  \n"
-    "pmaddubsw  %%xmm4,%%xmm1                  \n"
-    "pavgw      %%xmm5,%%xmm0                  \n"
-    "pavgw      %%xmm5,%%xmm1                  \n"
-    "packuswb   %%xmm1,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pmaddubsw  %%xmm4,%%xmm0                  \n"
+      "pmaddubsw  %%xmm4,%%xmm1                  \n"
+      "pavgw      %%xmm5,%%xmm0                  \n"
+      "pavgw      %%xmm5,%%xmm1                  \n"
+      "packuswb   %%xmm1,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
 }
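
In the linear variant xmm4 holds bytes of 1, so pmaddubsw forms horizontal pair sums and pavgw against zero halves with rounding: each output is the rounded average of two neighbors. A scalar sketch (hypothetical helper, not from the patch):

    #include <stdint.h>

    /* Scalar equivalent: rounded average of each horizontal pair. */
    static void ScaleRowDown2Linear_Sketch(const uint8_t* src_ptr,
                                           uint8_t* dst_ptr, int dst_width) {
      int x;
      for (x = 0; x < dst_width; ++x) {
        dst_ptr[x] = (uint8_t)((src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1);
      }
    }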
 
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrlw      $0xf,%%xmm4                    \n"
-    "packuswb   %%xmm4,%%xmm4                  \n"
-    "pxor       %%xmm5,%%xmm5                  \n"
+void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
+  asm volatile(
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrlw      $0xf,%%xmm4                    \n"
+      "packuswb   %%xmm4,%%xmm4                  \n"
+      "pxor       %%xmm5,%%xmm5                  \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pmaddubsw  %%xmm4,%%xmm0                  \n"
-    "pmaddubsw  %%xmm4,%%xmm1                  \n"
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    "psrlw      $0x1,%%xmm0                    \n"
-    "psrlw      $0x1,%%xmm1                    \n"
-    "pavgw      %%xmm5,%%xmm0                  \n"
-    "pavgw      %%xmm5,%%xmm1                  \n"
-    "packuswb   %%xmm1,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%3,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%3,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pmaddubsw  %%xmm4,%%xmm0                  \n"
+      "pmaddubsw  %%xmm4,%%xmm1                  \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "psrlw      $0x1,%%xmm0                    \n"
+      "psrlw      $0x1,%%xmm1                    \n"
+      "pavgw      %%xmm5,%%xmm0                  \n"
+      "pavgw      %%xmm5,%%xmm1                  \n"
+      "packuswb   %%xmm1,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
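
The box variant adds the pair sums from both rows, and psrlw $1 followed by pavgw against zero is arithmetically (sum + 2) >> 2, a rounded 2x2 box average. A scalar sketch (hypothetical helper, not from the patch):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar equivalent: rounded 2x2 box filter. */
    static void ScaleRowDown2Box_Sketch(const uint8_t* src_ptr,
                                        ptrdiff_t src_stride, uint8_t* dst_ptr,
                                        int dst_width) {
      const uint8_t* s = src_ptr;
      const uint8_t* t = src_ptr + src_stride;
      int x;
      for (x = 0; x < dst_width; ++x) {
        dst_ptr[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
        s += 2;
        t += 2;
      }
    }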
 
 #ifdef HAS_SCALEROWDOWN2_AVX2
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x20,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
-    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+      "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x20,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm4", "xmm5");
 }
 
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
-    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  asm volatile(
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+      "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
-    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
-    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
-    "sub        $0x20,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x00(%0,%3,1),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%3,1),%%ymm3           \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
+      "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
+      "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "lea        0x20(%1),%1                    \n"
+      "sub        $0x20,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SCALEROWDOWN2_AVX2
 
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrld     $0x18,%%xmm5                    \n"
-    "pslld     $0x10,%%xmm5                    \n"
+void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "pcmpeqb   %%xmm5,%%xmm5                   \n"
+      "psrld     $0x18,%%xmm5                    \n"
+      "pslld     $0x10,%%xmm5                    \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm0                   \n"
-    "psrlw     $0x8,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pand      %%xmm5,%%xmm0                   \n"
+      "pand      %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm0                   \n"
+      "psrlw     $0x8,%%xmm0                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
 }
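
The psrld/pslld mask keeps byte 2 of every dword, so after the two pack steps each output is the source pixel at offset 2 within its group of four; a scalar sketch (hypothetical helper, not from the patch):

    #include <stdint.h>

    /* Scalar equivalent: point-sample every fourth pixel (offset 2). */
    static void ScaleRowDown4_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                     int dst_width) {
      int x;
      for (x = 0; x < dst_width; ++x) {
        dst_ptr[x] = src_ptr[x * 4 + 2];
      }
    }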
 
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst_ptr,
+                            int dst_width) {
   intptr_t stridex3;
-  asm volatile (
-    "pcmpeqb    %%xmm4,%%xmm4                  \n"
-    "psrlw      $0xf,%%xmm4                    \n"
-    "movdqa     %%xmm4,%%xmm5                  \n"
-    "packuswb   %%xmm4,%%xmm4                  \n"
-    "psllw      $0x3,%%xmm5                    \n"
-    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"
+  asm volatile(
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"
+      "psrlw      $0xf,%%xmm4                    \n"
+      "movdqa     %%xmm4,%%xmm5                  \n"
+      "packuswb   %%xmm4,%%xmm4                  \n"
+      "psllw      $0x3,%%xmm5                    \n"
+      "lea       0x00(%4,%4,2),%3                \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
-    "pmaddubsw  %%xmm4,%%xmm0                  \n"
-    "pmaddubsw  %%xmm4,%%xmm1                  \n"
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
-    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pmaddubsw  %%xmm4,%%xmm2                  \n"
-    "pmaddubsw  %%xmm4,%%xmm3                  \n"
-    "paddw      %%xmm2,%%xmm0                  \n"
-    "paddw      %%xmm3,%%xmm1                  \n"
-    "phaddw     %%xmm1,%%xmm0                  \n"
-    "paddw      %%xmm5,%%xmm0                  \n"
-    "psrlw      $0x4,%%xmm0                    \n"
-    "packuswb   %%xmm0,%%xmm0                  \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x8,1) ",%1            \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width),   // %2
-    "=&r"(stridex3)    // %3
-  : "r"((intptr_t)(src_stride))    // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%4,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%4,1),%%xmm3            \n"
+      "pmaddubsw  %%xmm4,%%xmm0                  \n"
+      "pmaddubsw  %%xmm4,%%xmm1                  \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "movdqu    0x00(%0,%4,2),%%xmm2            \n"
+      "movdqu    0x10(%0,%4,2),%%xmm3            \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "movdqu    0x00(%0,%3,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%3,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pmaddubsw  %%xmm4,%%xmm2                  \n"
+      "pmaddubsw  %%xmm4,%%xmm3                  \n"
+      "paddw      %%xmm2,%%xmm0                  \n"
+      "paddw      %%xmm3,%%xmm1                  \n"
+      "phaddw     %%xmm1,%%xmm0                  \n"
+      "paddw      %%xmm5,%%xmm0                  \n"
+      "psrlw      $0x4,%%xmm0                    \n"
+      "packuswb   %%xmm0,%%xmm0                  \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "lea       0x8(%1),%1                      \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width),             // %2
+        "=&r"(stridex3)              // %3
+      : "r"((intptr_t)(src_stride))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
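
Four rows of pair sums accumulate, phaddw folds adjacent words into full 4x4 column sums, and xmm5 (8 in every word via psllw $0x3) supplies the rounding term before the >> 4; a scalar sketch (hypothetical helper, not from the patch):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar equivalent: rounded 4x4 box filter. */
    static void ScaleRowDown4Box_Sketch(const uint8_t* src_ptr,
                                        ptrdiff_t src_stride, uint8_t* dst_ptr,
                                        int dst_width) {
      int x, i, j;
      for (x = 0; x < dst_width; ++x) {
        int sum = 8;  /* rounding term: (sum + 8) >> 4 */
        for (j = 0; j < 4; ++j) {
          for (i = 0; i < 4; ++i) {
            sum += src_ptr[x * 4 + j * src_stride + i];
          }
        }
        dst_ptr[x] = (uint8_t)(sum >> 4);
      }
    }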
 
-
 #ifdef HAS_SCALEROWDOWN4_AVX2
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
-    "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
-    "vpslld     $0x10,%%ymm5,%%ymm5            \n"
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
-    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x10,1) ",%1          \n"
-    "sub        $0x10,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
-  );
+void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
+      "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
+      "vpslld     $0x10,%%ymm5,%%ymm5            \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
+      "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%xmm0,(%1)                    \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x10,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm5");
 }
 
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
-    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
-    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
-    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
+void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  asm volatile(
+      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
+      "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
+      "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
+      "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
-    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
-    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
-    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
-    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2
-    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3
-    "lea        " MEMLEA(0x40,0) ",%0          \n"
-    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
-    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
-    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
-    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
-    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
-    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
-    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
-    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
-    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x10,1) ",%1          \n"
-    "sub        $0x10,%2                       \n"
-    "jg         1b                             \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "r"((intptr_t)(src_stride * 3))   // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm0                    \n"
+      "vmovdqu    0x20(%0),%%ymm1                \n"
+      "vmovdqu    0x00(%0,%3,1),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%3,1),%%ymm3           \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vmovdqu    0x00(%0,%3,2),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%3,2),%%ymm3           \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vmovdqu    0x00(%0,%4,1),%%ymm2           \n"
+      "vmovdqu    0x20(%0,%4,1),%%ymm3           \n"
+      "lea        0x40(%0),%0                    \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
+      "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
+      "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
+      "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
+      "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
+      "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
+      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
+      "vmovdqu    %%xmm0,(%1)                    \n"
+      "lea        0x10(%1),%1                    \n"
+      "sub        $0x10,%2                       \n"
+      "jg         1b                             \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),                   // %0
+        "+r"(dst_ptr),                   // %1
+        "+r"(dst_width)                  // %2
+      : "r"((intptr_t)(src_stride)),     // %3
+        "r"((intptr_t)(src_stride * 3))  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 #endif  // HAS_SCALEROWDOWN4_AVX2
 
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm3                       \n"
-    "movdqa    %1,%%xmm4                       \n"
-    "movdqa    %2,%%xmm5                       \n"
-  :
-  : "m"(kShuf0),  // %0
-    "m"(kShuf1),  // %1
-    "m"(kShuf2)   // %2
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "palignr   $0x8,%%xmm0,%%xmm1              \n"
-    "pshufb    %%xmm3,%%xmm0                   \n"
-    "pshufb    %%xmm4,%%xmm1                   \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
-    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
-  );
+void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "movdqa    %0,%%xmm3                       \n"
+      "movdqa    %1,%%xmm4                       \n"
+      "movdqa    %2,%%xmm5                       \n"
+      :
+      : "m"(kShuf0),  // %0
+        "m"(kShuf1),  // %1
+        "m"(kShuf2)   // %2
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm2                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqa    %%xmm2,%%xmm1                   \n"
+      "palignr   $0x8,%%xmm0,%%xmm1              \n"
+      "pshufb    %%xmm3,%%xmm0                   \n"
+      "pshufb    %%xmm4,%%xmm1                   \n"
+      "pshufb    %%xmm5,%%xmm2                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movq      %%xmm1,0x8(%1)                  \n"
+      "movq      %%xmm2,0x10(%1)                 \n"
+      "lea       0x18(%1),%1                     \n"
+      "sub       $0x18,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
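
kShuf0, kShuf1 and kShuf2 together select pixels 0, 1 and 3 of every group of four (the 128 entries zero the unused lanes), yielding 24 outputs per 32 inputs; a scalar sketch (hypothetical helper, not from the patch):

    #include <stdint.h>

    /* Scalar equivalent: 3/4 point sampling, keeping pixels 0, 1 and 3.
     * Assumes dst_width is a multiple of 3, as the asm loop does. */
    static void ScaleRowDown34_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                      int dst_width) {
      int x;
      for (x = 0; x < dst_width; x += 3) {
        dst_ptr[x + 0] = src_ptr[0];
        dst_ptr[x + 1] = src_ptr[1];
        dst_ptr[x + 2] = src_ptr[3];
        src_ptr += 4;
      }
    }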
 
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
-  );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
-    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm0,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm4,%%xmm6                   \n"
-    "pmaddubsw %4,%%xmm6                       \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  : "r"((intptr_t)(src_stride)),  // %3
-    "m"(kMadd21)     // %4
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"  // kShuf01
+      "movdqa    %1,%%xmm3                       \n"  // kShuf11
+      "movdqa    %2,%%xmm4                       \n"  // kShuf21
+      :
+      : "m"(kShuf01),  // %0
+        "m"(kShuf11),  // %1
+        "m"(kShuf21)   // %2
+      );
+  asm volatile(
+      "movdqa    %0,%%xmm5                       \n"  // kMadd01
+      "movdqa    %1,%%xmm0                       \n"  // kMadd11
+      "movdqa    %2,%%xmm1                       \n"  // kRound34
+      :
+      : "m"(kMadd01),  // %0
+        "m"(kMadd11),  // %1
+        "m"(kRound34)  // %2
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm7            \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm5,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,(%1)                     \n"
+      "movdqu    0x8(%0),%%xmm6                  \n"
+      "movdqu    0x8(%0,%3,1),%%xmm7             \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm3,%%xmm6                   \n"
+      "pmaddubsw %%xmm0,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x8(%1)                  \n"
+      "movdqu    0x10(%0),%%xmm6                 \n"
+      "movdqu    0x10(%0,%3,1),%%xmm7            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm4,%%xmm6                   \n"
+      "pmaddubsw %4,%%xmm6                       \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x10(%1)                 \n"
+      "lea       0x18(%1),%1                     \n"
+      "sub       $0x18,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "m"(kMadd21)                  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
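
Reading kShuf01 and kMadd01 together, the horizontal taps per group of four blended pixels are d0 = (3*s0 + s1 + 2) >> 2, d1 = (s1 + s2 + 1) >> 1 and d2 = (s2 + 3*s3 + 2) >> 2, with the single pavgb blending the two source rows 1:1; ScaleRowDown34_0_Box_SSSE3 below differs only in using two pavgb passes to weight the rows 3:1. A hedged scalar reading of those constants (hypothetical helper, not from the patch):

    #include <stddef.h>
    #include <stdint.h>

    /* Scalar reading of the shuffle/madd constants: horizontal 3/4 taps
     * after a 1:1 two-row blend. Assumes dst_width is a multiple of 3. */
    static void ScaleRowDown34_1_Box_Sketch(const uint8_t* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8_t* dst_ptr, int dst_width) {
      int x;
      for (x = 0; x < dst_width; x += 3) {
        uint8_t s0 = (uint8_t)((src_ptr[0] + src_ptr[src_stride + 0] + 1) >> 1);
        uint8_t s1 = (uint8_t)((src_ptr[1] + src_ptr[src_stride + 1] + 1) >> 1);
        uint8_t s2 = (uint8_t)((src_ptr[2] + src_ptr[src_stride + 2] + 1) >> 1);
        uint8_t s3 = (uint8_t)((src_ptr[3] + src_ptr[src_stride + 3] + 1) >> 1);
        dst_ptr[x + 0] = (uint8_t)((s0 * 3 + s1 + 2) >> 2);
        dst_ptr[x + 1] = (uint8_t)((s1 + s2 + 1) >> 1);
        dst_ptr[x + 2] = (uint8_t)((s2 + s3 * 3 + 2) >> 2);
        src_ptr += 4;
      }
    }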
 
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"  // kShuf01
-    "movdqa    %1,%%xmm3                       \n"  // kShuf11
-    "movdqa    %2,%%xmm4                       \n"  // kShuf21
-  :
-  : "m"(kShuf01),  // %0
-    "m"(kShuf11),  // %1
-    "m"(kShuf21)   // %2
-  );
-  asm volatile (
-    "movdqa    %0,%%xmm5                       \n"  // kMadd01
-    "movdqa    %1,%%xmm0                       \n"  // kMadd11
-    "movdqa    %2,%%xmm1                       \n"  // kRound34
-  :
-  : "m"(kMadd01),  // %0
-    "m"(kMadd11),  // %1
-    "m"(kRound34)  // %2
-  );
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"  // kShuf01
+      "movdqa    %1,%%xmm3                       \n"  // kShuf11
+      "movdqa    %2,%%xmm4                       \n"  // kShuf21
+      :
+      : "m"(kShuf01),  // %0
+        "m"(kShuf11),  // %1
+        "m"(kShuf21)   // %2
+      );
+  asm volatile(
+      "movdqa    %0,%%xmm5                       \n"  // kMadd01
+      "movdqa    %1,%%xmm0                       \n"  // kMadd11
+      "movdqa    %2,%%xmm1                       \n"  // kRound34
+      :
+      : "m"(kMadd01),  // %0
+        "m"(kMadd11),  // %1
+        "m"(kRound34)  // %2
+      );
 
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "pmaddubsw %%xmm5,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS(1) "         \n"
-    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
-    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "pmaddubsw %%xmm0,%%xmm6                   \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
-    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm6,%%xmm7                   \n"
-    "pavgb     %%xmm7,%%xmm6                   \n"
-    "pshufb    %%xmm4,%%xmm6                   \n"
-    "pmaddubsw %4,%%xmm6                       \n"
-    "paddsw    %%xmm1,%%xmm6                   \n"
-    "psrlw     $0x2,%%xmm6                     \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x18,1) ",%1           \n"
-    "sub       $0x18,%2                        \n"
-    "jg        1b                              \n"
-    : "+r"(src_ptr),   // %0
-      "+r"(dst_ptr),   // %1
-      "+r"(dst_width)  // %2
-    : "r"((intptr_t)(src_stride)),  // %3
-      "m"(kMadd21)     // %4
-    : "memory", "cc", NACL_R14
-      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm6                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm7            \n"
+      "pavgb     %%xmm6,%%xmm7                   \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm2,%%xmm6                   \n"
+      "pmaddubsw %%xmm5,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,(%1)                     \n"
+      "movdqu    0x8(%0),%%xmm6                  \n"
+      "movdqu    0x8(%0,%3,1),%%xmm7             \n"
+      "pavgb     %%xmm6,%%xmm7                   \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm3,%%xmm6                   \n"
+      "pmaddubsw %%xmm0,%%xmm6                   \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x8(%1)                  \n"
+      "movdqu    0x10(%0),%%xmm6                 \n"
+      "movdqu    0x10(%0,%3,1),%%xmm7            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm6,%%xmm7                   \n"
+      "pavgb     %%xmm7,%%xmm6                   \n"
+      "pshufb    %%xmm4,%%xmm6                   \n"
+      "pmaddubsw %4,%%xmm6                       \n"
+      "paddsw    %%xmm1,%%xmm6                   \n"
+      "psrlw     $0x2,%%xmm6                     \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movq      %%xmm6,0x10(%1)                 \n"
+      "lea       0x18(%1),%1                     \n"
+      "sub       $0x18,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),                // %0
+        "+r"(dst_ptr),                // %1
+        "+r"(dst_width)               // %2
+      : "r"((intptr_t)(src_stride)),  // %3
+        "m"(kMadd21)                  // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
 
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %3,%%xmm4                       \n"
-    "movdqa    %4,%%xmm5                       \n"
+void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst_ptr,
+                          int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "movdqa    %3,%%xmm4                       \n"
+      "movdqa    %4,%%xmm5                       \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "paddusb   %%xmm1,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(1) "         \n"
-    "movhlps   %%xmm0,%%xmm1                   \n"
-    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
-    "lea       " MEMLEA(0xc,1) ",%1            \n"
-    "sub       $0xc,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width)  // %2
-  : "m"(kShuf38a),   // %3
-    "m"(kShuf38b)    // %4
-  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "paddusb   %%xmm1,%%xmm0                   \n"
+      "movq      %%xmm0,(%1)                     \n"
+      "movhlps   %%xmm0,%%xmm1                   \n"
+      "movd      %%xmm1,0x8(%1)                  \n"
+      "lea       0xc(%1),%1                      \n"
+      "sub       $0xc,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      : "m"(kShuf38a),   // %3
+        "m"(kShuf38b)    // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5");
 }
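
kShuf38a and kShuf38b pick bytes 0, 3, 6, 8, 11 and 14 of each 16-byte half, i.e. pixels 0, 3 and 6 of every group of eight; a scalar sketch (hypothetical helper, not from the patch):

    #include <stdint.h>

    /* Scalar equivalent: 3/8 point sampling, keeping pixels 0, 3 and 6.
     * Assumes dst_width is a multiple of 3, as the asm loop does. */
    static void ScaleRowDown38_Sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                      int dst_width) {
      int x;
      for (x = 0; x < dst_width; x += 3) {
        dst_ptr[x + 0] = src_ptr[0];
        dst_ptr[x + 1] = src_ptr[3];
        dst_ptr[x + 2] = src_ptr[6];
        src_ptr += 8;
      }
    }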
 
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "movdqa    %3,%%xmm5                       \n"
-  :
-  : "m"(kShufAb0),   // %0
-    "m"(kShufAb1),   // %1
-    "m"(kShufAb2),   // %2
-    "m"(kScaleAb2)   // %3
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "pavgb     %%xmm1,%%xmm0                   \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "pshufb    %%xmm2,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "pshufb    %%xmm3,%%xmm6                   \n"
-    "paddusw   %%xmm6,%%xmm1                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "paddusw   %%xmm0,%%xmm1                   \n"
-    "pmulhuw   %%xmm5,%%xmm1                   \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movd      %%xmm1," MEMACCESS(1) "         \n"
-    "psrlq     $0x10,%%xmm1                    \n"
-    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
-    "lea       " MEMLEA(0x6,1) ",%1            \n"
-    "sub       $0x6,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(dst_width)    // %2
-  : "r"((intptr_t)(src_stride))  // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"
+      "movdqa    %1,%%xmm3                       \n"
+      "movdqa    %2,%%xmm4                       \n"
+      "movdqa    %3,%%xmm5                       \n"
+      :
+      : "m"(kShufAb0),  // %0
+        "m"(kShufAb1),  // %1
+        "m"(kShufAb2),  // %2
+        "m"(kScaleAb2)  // %3
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm1            \n"
+      "lea       0x10(%0),%0                     \n"
+      "pavgb     %%xmm1,%%xmm0                   \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "pshufb    %%xmm2,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm6                   \n"
+      "pshufb    %%xmm3,%%xmm6                   \n"
+      "paddusw   %%xmm6,%%xmm1                   \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "paddusw   %%xmm0,%%xmm1                   \n"
+      "pmulhuw   %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movd      %%xmm1,(%1)                     \n"
+      "psrlq     $0x10,%%xmm1                    \n"
+      "movd      %%xmm1,0x2(%1)                  \n"
+      "lea       0x6(%1),%1                      \n"
+      "sub       $0x6,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
+void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
                                 ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movdqa    %0,%%xmm2                       \n"
-    "movdqa    %1,%%xmm3                       \n"
-    "movdqa    %2,%%xmm4                       \n"
-    "pxor      %%xmm5,%%xmm5                   \n"
-  :
-  : "m"(kShufAc),    // %0
-    "m"(kShufAc3),   // %1
-    "m"(kScaleAc33)  // %2
-  );
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
-    "movhlps   %%xmm0,%%xmm1                   \n"
-    "movhlps   %%xmm6,%%xmm7                   \n"
-    "punpcklbw %%xmm5,%%xmm0                   \n"
-    "punpcklbw %%xmm5,%%xmm1                   \n"
-    "punpcklbw %%xmm5,%%xmm6                   \n"
-    "punpcklbw %%xmm5,%%xmm7                   \n"
-    "paddusw   %%xmm6,%%xmm0                   \n"
-    "paddusw   %%xmm7,%%xmm1                   \n"
-    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
-    "lea       " MEMLEA(0x10,0) ",%0           \n"
-    "movhlps   %%xmm6,%%xmm7                   \n"
-    "punpcklbw %%xmm5,%%xmm6                   \n"
-    "punpcklbw %%xmm5,%%xmm7                   \n"
-    "paddusw   %%xmm6,%%xmm0                   \n"
-    "paddusw   %%xmm7,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm6                   \n"
-    "psrldq    $0x2,%%xmm0                     \n"
-    "paddusw   %%xmm0,%%xmm6                   \n"
-    "psrldq    $0x2,%%xmm0                     \n"
-    "paddusw   %%xmm0,%%xmm6                   \n"
-    "pshufb    %%xmm2,%%xmm6                   \n"
-    "movdqa    %%xmm1,%%xmm7                   \n"
-    "psrldq    $0x2,%%xmm1                     \n"
-    "paddusw   %%xmm1,%%xmm7                   \n"
-    "psrldq    $0x2,%%xmm1                     \n"
-    "paddusw   %%xmm1,%%xmm7                   \n"
-    "pshufb    %%xmm3,%%xmm7                   \n"
-    "paddusw   %%xmm7,%%xmm6                   \n"
-    "pmulhuw   %%xmm4,%%xmm6                   \n"
-    "packuswb  %%xmm6,%%xmm6                   \n"
-    "movd      %%xmm6," MEMACCESS(1) "         \n"
-    "psrlq     $0x10,%%xmm6                    \n"
-    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
-    "lea       " MEMLEA(0x6,1) ",%1            \n"
-    "sub       $0x6,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),    // %0
-    "+r"(dst_ptr),    // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+                                uint8_t* dst_ptr,
+                                int dst_width) {
+  asm volatile(
+      "movdqa    %0,%%xmm2                       \n"
+      "movdqa    %1,%%xmm3                       \n"
+      "movdqa    %2,%%xmm4                       \n"
+      "pxor      %%xmm5,%%xmm5                   \n"
+      :
+      : "m"(kShufAc),    // %0
+        "m"(kShufAc3),   // %1
+        "m"(kScaleAc33)  // %2
+      );
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x00(%0,%3,1),%%xmm6            \n"
+      "movhlps   %%xmm0,%%xmm1                   \n"
+      "movhlps   %%xmm6,%%xmm7                   \n"
+      "punpcklbw %%xmm5,%%xmm0                   \n"
+      "punpcklbw %%xmm5,%%xmm1                   \n"
+      "punpcklbw %%xmm5,%%xmm6                   \n"
+      "punpcklbw %%xmm5,%%xmm7                   \n"
+      "paddusw   %%xmm6,%%xmm0                   \n"
+      "paddusw   %%xmm7,%%xmm1                   \n"
+      "movdqu    0x00(%0,%3,2),%%xmm6            \n"
+      "lea       0x10(%0),%0                     \n"
+      "movhlps   %%xmm6,%%xmm7                   \n"
+      "punpcklbw %%xmm5,%%xmm6                   \n"
+      "punpcklbw %%xmm5,%%xmm7                   \n"
+      "paddusw   %%xmm6,%%xmm0                   \n"
+      "paddusw   %%xmm7,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm6                   \n"
+      "psrldq    $0x2,%%xmm0                     \n"
+      "paddusw   %%xmm0,%%xmm6                   \n"
+      "psrldq    $0x2,%%xmm0                     \n"
+      "paddusw   %%xmm0,%%xmm6                   \n"
+      "pshufb    %%xmm2,%%xmm6                   \n"
+      "movdqa    %%xmm1,%%xmm7                   \n"
+      "psrldq    $0x2,%%xmm1                     \n"
+      "paddusw   %%xmm1,%%xmm7                   \n"
+      "psrldq    $0x2,%%xmm1                     \n"
+      "paddusw   %%xmm1,%%xmm7                   \n"
+      "pshufb    %%xmm3,%%xmm7                   \n"
+      "paddusw   %%xmm7,%%xmm6                   \n"
+      "pmulhuw   %%xmm4,%%xmm6                   \n"
+      "packuswb  %%xmm6,%%xmm6                   \n"
+      "movd      %%xmm6,(%1)                     \n"
+      "psrlq     $0x10,%%xmm6                    \n"
+      "movd      %%xmm6,0x2(%1)                  \n"
+      "lea       0x6(%1),%1                      \n"
+      "sub       $0x6,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),               // %0
+        "+r"(dst_ptr),               // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
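
The 3_Box variant averages whole blocks instead of point-sampling: per group of 8 source columns and 3 rows it emits three pixels, the first two from 3x3 blocks and the third from the remaining 2x3 block. The pmulhuw by kScaleAc33 above plays the role of the * (65536 / 9) >> 16 in this scalar sketch (illustrative; rounding matches the SIMD path only approximately):

    static void ScaleRowDown38_3_Box_sketch(const uint8_t* s, ptrdiff_t stride,
                                            uint8_t* dst, int dst_width) {
      int x;
      for (x = 0; x < dst_width; x += 3) {
        dst[x + 0] = (uint8_t)(((s[0] + s[1] + s[2] +
                                 s[stride + 0] + s[stride + 1] + s[stride + 2] +
                                 s[2 * stride + 0] + s[2 * stride + 1] +
                                 s[2 * stride + 2]) * (65536 / 9)) >> 16);
        dst[x + 1] = (uint8_t)(((s[3] + s[4] + s[5] +
                                 s[stride + 3] + s[stride + 4] + s[stride + 5] +
                                 s[2 * stride + 3] + s[2 * stride + 4] +
                                 s[2 * stride + 5]) * (65536 / 9)) >> 16);
        dst[x + 2] = (uint8_t)(((s[6] + s[7] + s[stride + 6] + s[stride + 7] +
                                 s[2 * stride + 6] + s[2 * stride + 7]) *
                                (65536 / 6)) >> 16);
        s += 8;
      }
    }
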
 
 // Reads 16xN bytes and produces 16 shorts at a time.
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
-  asm volatile (
-    "pxor      %%xmm5,%%xmm5                   \n"
+void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+                      uint16_t* dst_ptr,
+                      int src_width) {
+  asm volatile(
 
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
-    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"
-    "movdqa    %%xmm3,%%xmm2                   \n"
-    "punpcklbw %%xmm5,%%xmm2                   \n"
-    "punpckhbw %%xmm5,%%xmm3                   \n"
-    "paddusw   %%xmm2,%%xmm0                   \n"
-    "paddusw   %%xmm3,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
-    "lea       " MEMLEA(0x20,1) ",%1           \n"
-    "sub       $0x10,%2                        \n"
-    "jg        1b                              \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(src_width)    // %2
-  :
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      "pxor      %%xmm5,%%xmm5                   \n"
+
+      // 16 pixel loop.
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm3                     \n"
+      "lea       0x10(%0),%0                     \n"  // src_ptr += 16
+      "movdqu    (%1),%%xmm0                     \n"
+      "movdqu    0x10(%1),%%xmm1                 \n"
+      "movdqa    %%xmm3,%%xmm2                   \n"
+      "punpcklbw %%xmm5,%%xmm2                   \n"
+      "punpckhbw %%xmm5,%%xmm3                   \n"
+      "paddusw   %%xmm2,%%xmm0                   \n"
+      "paddusw   %%xmm3,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "movdqu    %%xmm1,0x10(%1)                 \n"
+      "lea       0x20(%1),%1                     \n"
+      "sub       $0x10,%2                        \n"
+      "jg        1b                              \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
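
Both ScaleAddRow variants widen source bytes to 16 bits and accumulate them into the destination row, so a scalar reference is one line per pixel (a sketch; the SIMD versions use saturating paddusw/vpaddusw, which this ignores):

    static void ScaleAddRow_sketch(const uint8_t* src, uint16_t* dst,
                                   int src_width) {
      int x;
      for (x = 0; x < src_width; ++x) {
        dst[x] += src[x];  // accumulate; assumes sums stay below 65535
      }
    }
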
 
-
 #ifdef HAS_SCALEADDROW_AVX2
 // Reads 32 bytes and accumulates to 32 shorts at a time.
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
-  asm volatile (
-    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+                      uint16_t* dst_ptr,
+                      int src_width) {
+  asm volatile(
 
-    LABELALIGN
-  "1:                                          \n"
-    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"
-    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
-    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
-    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
-    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
-    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
-    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
-    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
-    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
-    "lea       " MEMLEA(0x40,1) ",%1           \n"
-    "sub       $0x20,%2                        \n"
-    "jg        1b                              \n"
-    "vzeroupper                                \n"
-  : "+r"(src_ptr),     // %0
-    "+r"(dst_ptr),     // %1
-    "+r"(src_width)    // %2
-  :
-  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
-  );
+      "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "vmovdqu    (%0),%%ymm3                    \n"
+      "lea        0x20(%0),%0                    \n"  // src_ptr += 32
+      "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
+      "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
+      "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
+      "vpaddusw   (%1),%%ymm2,%%ymm0             \n"
+      "vpaddusw   0x20(%1),%%ymm3,%%ymm1         \n"
+      "vmovdqu    %%ymm0,(%1)                    \n"
+      "vmovdqu    %%ymm1,0x20(%1)                \n"
+      "lea       0x40(%1),%1                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+      "vzeroupper                                \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_width)  // %2
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5");
 }
 #endif  // HAS_SCALEADDROW_AVX2
 
 // Constant for making pixels signed to avoid pmaddubsw
 // saturation.
-static uvec8 kFsub80 =
-  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
 
 // Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 =
-  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+                               0x4040, 0x4040, 0x4040, 0x4040};
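
A worked view of how these constants are used in the filter below (the algebra is a sketch of the intent, with f the 7-bit fraction and a, b the two source pixels):

    // pmaddubsw multiplies unsigned bytes by signed bytes, so pixels are
    // first biased into signed range: p' = p - 0x80 (psubb kFsub80).
    // The weight pair (128 - f, f) is built branch-free from f <= 127 via
    // 128 - f == (f ^ 127) + 1 (the pxor/paddusb in the loop).
    // Since the weights sum to 128:
    //   (128 - f)*(a - 128) + f*(b - 128) = 128*blend - 16384
    // and adding kFadd40 = 0x4040 = 16384 + 64 restores the bias and adds
    // the rounding term, so the final psrlw $7 yields the rounded blend.
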
 
 // Bilinear column filtering. SSSE3 version.
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx) {
+void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+                           const uint8_t* src_ptr,
+                           int dst_width,
+                           int x,
+                           int dx) {
   intptr_t x0, x1, temp_pixel;
-  asm volatile (
-    "movd      %6,%%xmm2                       \n"
-    "movd      %7,%%xmm3                       \n"
-    "movl      $0x04040000,%k2                 \n"
-    "movd      %k2,%%xmm5                      \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
-    "pcmpeqb   %%xmm7,%%xmm7                   \n"
-    "psrlw     $15,%%xmm7                      \n"  // 0x00010001
+  asm volatile(
+      "movd      %6,%%xmm2                       \n"
+      "movd      %7,%%xmm3                       \n"
+      "movl      $0x04040000,%k2                 \n"
+      "movd      %k2,%%xmm5                      \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
+      "pcmpeqb   %%xmm7,%%xmm7                   \n"
+      "psrlw     $15,%%xmm7                      \n"  // 0x00010001
 
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "subl      $0x2,%5                         \n"
-    "jl        29f                             \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "punpckldq %%xmm0,%%xmm2                   \n"
-    "punpckldq %%xmm3,%%xmm3                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "subl      $0x2,%5                         \n"
+      "jl        29f                             \n"
+      "movdqa    %%xmm2,%%xmm0                   \n"
+      "paddd     %%xmm3,%%xmm0                   \n"
+      "punpckldq %%xmm0,%%xmm2                   \n"
+      "punpckldq %%xmm3,%%xmm3                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
 
-    LABELALIGN
-  "2:                                          \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
-    "movd      %k2,%%xmm0                      \n"
-    "psrlw     $0x9,%%xmm1                     \n"
-    MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
-    "movd      %k2,%%xmm4                      \n"
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "punpcklwd %%xmm4,%%xmm0                   \n"
-    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
-    "pxor      %%xmm6,%%xmm1                   \n"  // 128 -f = (f ^ 127 ) + 1
-    "paddusb   %%xmm7,%%xmm1                   \n"
-    "pmaddubsw %%xmm0,%%xmm1                   \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-    "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
-    "psrlw     $0x7,%%xmm1                     \n"
-    "packuswb  %%xmm1,%%xmm1                   \n"
-    "movd      %%xmm1,%k2                      \n"
-    "mov       %w2," MEMACCESS(0) "            \n"
-    "lea       " MEMLEA(0x2,0) ",%0            \n"
-    "subl      $0x2,%5                         \n"
-    "jge       2b                              \n"
+      LABELALIGN
+      "2:                                        \n"
+      "movdqa    %%xmm2,%%xmm1                   \n"
+      "paddd     %%xmm3,%%xmm2                   \n"
+      "movzwl    0x00(%1,%3,1),%k2               \n"
+      "movd      %k2,%%xmm0                      \n"
+      "psrlw     $0x9,%%xmm1                     \n"
+      "movzwl    0x00(%1,%4,1),%k2               \n"
+      "movd      %k2,%%xmm4                      \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "punpcklwd %%xmm4,%%xmm0                   \n"
+      "psubb     %8,%%xmm0                       \n"  // make pixels signed.
+      "pxor      %%xmm6,%%xmm1                   \n"  // 128 - f = (f ^ 127 ) +
+                                                      // 1
+      "paddusb   %%xmm7,%%xmm1                   \n"
+      "pmaddubsw %%xmm0,%%xmm1                   \n"
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
+      "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
+      "psrlw     $0x7,%%xmm1                     \n"
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "movd      %%xmm1,%k2                      \n"
+      "mov       %w2,(%0)                        \n"
+      "lea       0x2(%0),%0                      \n"
+      "subl      $0x2,%5                         \n"
+      "jge       2b                              \n"
 
-    LABELALIGN
-  "29:                                         \n"
-    "addl      $0x1,%5                         \n"
-    "jl        99f                             \n"
-    MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
-    "movd      %k2,%%xmm0                      \n"
-    "psrlw     $0x9,%%xmm2                     \n"
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
-    "pxor      %%xmm6,%%xmm2                   \n"
-    "paddusb   %%xmm7,%%xmm2                   \n"
-    "pmaddubsw %%xmm0,%%xmm2                   \n"
-    "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
-    "psrlw     $0x7,%%xmm2                     \n"
-    "packuswb  %%xmm2,%%xmm2                   \n"
-    "movd      %%xmm2,%k2                      \n"
-    "mov       %b2," MEMACCESS(0) "            \n"
-  "99:                                         \n"
-  : "+r"(dst_ptr),      // %0
-    "+r"(src_ptr),      // %1
-    "=&a"(temp_pixel),  // %2
-    "=&r"(x0),          // %3
-    "=&r"(x1),          // %4
+      LABELALIGN
+      "29:                                       \n"
+      "addl      $0x1,%5                         \n"
+      "jl        99f                             \n"
+      "movzwl    0x00(%1,%3,1),%k2               \n"
+      "movd      %k2,%%xmm0                      \n"
+      "psrlw     $0x9,%%xmm2                     \n"
+      "pshufb    %%xmm5,%%xmm2                   \n"
+      "psubb     %8,%%xmm0                       \n"  // make pixels signed.
+      "pxor      %%xmm6,%%xmm2                   \n"
+      "paddusb   %%xmm7,%%xmm2                   \n"
+      "pmaddubsw %%xmm0,%%xmm2                   \n"
+      "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
+      "psrlw     $0x7,%%xmm2                     \n"
+      "packuswb  %%xmm2,%%xmm2                   \n"
+      "movd      %%xmm2,%k2                      \n"
+      "mov       %b2,(%0)                        \n"
+      "99:                                       \n"
+      : "+r"(dst_ptr),      // %0
+        "+r"(src_ptr),      // %1
+        "=&a"(temp_pixel),  // %2
+        "=&r"(x0),          // %3
+        "=&r"(x1),          // %4
 #if defined(__x86_64__)
-    "+rm"(dst_width)    // %5
+        "+rm"(dst_width)  // %5
 #else
-    "+m"(dst_width)    // %5
+        "+m"(dst_width)  // %5
 #endif
-  : "rm"(x),            // %6
-    "rm"(dx),           // %7
+      : "rm"(x),   // %6
+        "rm"(dx),  // %7
 #if defined(__x86_64__)
-    "x"(kFsub80),       // %8
-    "x"(kFadd40)        // %9
+        "x"(kFsub80),  // %8
+        "x"(kFadd40)   // %9
 #else
-    "m"(kFsub80),       // %8
-    "m"(kFadd40)        // %9
+        "m"(kFsub80),    // %8
+        "m"(kFadd40)     // %9
 #endif
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-  );
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
 }
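
The scalar shape of this column filter, for reference (a sketch, not the library's C fallback verbatim): x and dx are 16.16 fixed point, and the asm keeps only the top 7 fraction bits (psrlw $9):

    static void ScaleFilterCols_sketch(uint8_t* dst, const uint8_t* src,
                                       int dst_width, int x, int dx) {
      int j;
      for (j = 0; j < dst_width; ++j) {
        int xi = x >> 16;         // integer source position
        int f = (x >> 9) & 0x7f;  // top 7 bits of the fraction
        dst[j] = (uint8_t)((src[xi] * (128 - f) + src[xi + 1] * f + 64) >> 7);
        x += dx;
      }
    }
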
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpcklbw %%xmm0,%%xmm0                   \n"
-    "punpckhbw %%xmm1,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub       $0x20,%2                         \n"
-    "jg        1b                              \n"
+void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+                       const uint8_t* src_ptr,
+                       int dst_width,
+                       int x,
+                       int dx) {
+  (void)x;
+  (void)dx;
+  asm volatile(
 
-  : "+r"(dst_ptr),     // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_width)    // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpcklbw %%xmm0,%%xmm0                   \n"
+      "punpckhbw %%xmm1,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%0)                     \n"
+      "movdqu    %%xmm1,0x10(%0)                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub       $0x20,%2                        \n"
+      "jg        1b                              \n"
+
+      : "+r"(dst_ptr),   // %0
+        "+r"(src_ptr),   // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
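
The Up2 column path is pure duplication, so the scalar equivalent is trivial (sketch):

    // Write each source pixel twice (2x horizontal upscale).
    static void ScaleColsUp2_sketch(uint8_t* dst, const uint8_t* src,
                                    int dst_width) {
      int x;
      for (x = 0; x < dst_width; x += 2) {
        dst[x] = dst[x + 1] = src[x / 2];
      }
    }
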
 
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
                             ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+                            uint8_t* dst_argb,
+                            int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "shufps    $0xdd,%%xmm1,%%xmm0             \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
                                   ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(dst_width)  // %2
-  :: "memory", "cc", "xmm0", "xmm1"
-  );
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
+void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
                                ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
-    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
-    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
-    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "sub       $0x4,%2                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),   // %0
-    "+r"(dst_argb),   // %1
-    "+r"(dst_width)   // %2
-  : "r"((intptr_t)(src_stride))   // %3
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  asm volatile(
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"
+      "movdqu    0x10(%0),%%xmm1                 \n"
+      "movdqu    0x00(%0,%3,1),%%xmm2            \n"
+      "movdqu    0x10(%0,%3,1),%%xmm3            \n"
+      "lea       0x20(%0),%0                     \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "pavgb     %%xmm3,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%1)                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "sub       $0x4,%2                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),              // %0
+        "+r"(dst_argb),              // %1
+        "+r"(dst_width)              // %2
+      : "r"((intptr_t)(src_stride))  // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
 }
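
The three ARGB Down2 variants above differ only in sampling: the plain version keeps the odd pixels (shufps $0xdd), Linear averages horizontal pairs, and Box averages full 2x2 blocks. A scalar sketch of the Box case, treating each ARGB pixel as 4 bytes (illustrative; the pavgb chain above rounds each stage separately, so results can differ by 1):

    static void ScaleARGBRowDown2Box_sketch(const uint8_t* src, ptrdiff_t stride,
                                            uint8_t* dst, int dst_width) {
      int x, c;
      for (x = 0; x < dst_width; ++x) {
        for (c = 0; c < 4; ++c) {  // average each channel of a 2x2 block
          dst[c] = (uint8_t)((src[c] + src[c + 4] +
                              src[stride + c] + src[stride + c + 4] + 2) >> 2);
        }
        src += 8;  // two source pixels
        dst += 4;  // one destination pixel
      }
    }
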
 
 // Reads 4 pixels at a time.
 // Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width) {
   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
   intptr_t src_stepx_x12;
-  asm volatile (
-    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
-    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
-    LABELALIGN
-  "1:                                          \n"
-    "movd      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
-    MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
-    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
-    "punpckldq %%xmm3,%%xmm2                   \n"
-    "punpcklqdq %%xmm2,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),       // %0
-    "+r"(src_stepx_x4),   // %1
-    "+r"(dst_argb),       // %2
-    "+r"(dst_width),      // %3
-    "=&r"(src_stepx_x12)  // %4
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+  (void)src_stride;
+  asm volatile(
+      "lea       0x00(,%1,4),%1                  \n"
+      "lea       0x00(%1,%1,2),%4                \n"
+
+      LABELALIGN
+      "1:                                        \n"
+      "movd      (%0),%%xmm0                     \n"
+      "movd      0x00(%0,%1,1),%%xmm1            \n"
+      "punpckldq %%xmm1,%%xmm0                   \n"
+      "movd      0x00(%0,%1,2),%%xmm2            \n"
+      "movd      0x00(%0,%4,1),%%xmm3            \n"
+      "lea       0x00(%0,%1,4),%0                \n"
+      "punpckldq %%xmm3,%%xmm2                   \n"
+      "punpcklqdq %%xmm2,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),       // %0
+        "+r"(src_stepx_x4),   // %1
+        "+r"(dst_argb),       // %2
+        "+r"(dst_width),      // %3
+        "=&r"(src_stepx_x12)  // %4
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
 }
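
The lea chain above just precomputes step*4 and step*12 in bytes so four strided pixels can be gathered per iteration; logically the function copies every src_stepx-th ARGB pixel (a sketch; the casts assume 4-byte alignment):

    static void ScaleARGBRowDownEven_sketch(const uint8_t* src, int src_stepx,
                                            uint8_t* dst, int dst_width) {
      const uint32_t* s = (const uint32_t*)src;  // one uint32_t per ARGB pixel
      uint32_t* d = (uint32_t*)dst;
      int x;
      for (x = 0; x < dst_width; ++x) {
        d[x] = s[x * src_stepx];
      }
    }
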
 
 // Blends four 2x2 to 4x1.
 // Alignment requirement: dst_argb 16 byte aligned.
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride, int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
+void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  int src_stepx,
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
   intptr_t src_stepx_x12;
   intptr_t row1 = (intptr_t)(src_stride);
-  asm volatile (
-    "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
-    "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
-    "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"
+  asm volatile(
+      "lea       0x00(,%1,4),%1                  \n"
+      "lea       0x00(%1,%1,2),%4                \n"
+      "lea       0x00(%0,%5,1),%5                \n"
 
-    LABELALIGN
-  "1:                                          \n"
-    "movq      " MEMACCESS(0) ",%%xmm0         \n"
-    MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
-    MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
-    MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
-    "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
-    "movq      " MEMACCESS(5) ",%%xmm2         \n"
-    MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
-    MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
-    MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
-    "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "pavgb     %%xmm3,%%xmm1                   \n"
-    "movdqa    %%xmm0,%%xmm2                   \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
-    "shufps    $0xdd,%%xmm1,%%xmm2             \n"
-    "pavgb     %%xmm2,%%xmm0                   \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%3                         \n"
-    "jg        1b                              \n"
-  : "+r"(src_argb),        // %0
-    "+r"(src_stepx_x4),    // %1
-    "+r"(dst_argb),        // %2
-    "+rm"(dst_width),      // %3
-    "=&r"(src_stepx_x12),  // %4
-    "+r"(row1)             // %5
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movq      (%0),%%xmm0                     \n"
+      "movhps    0x00(%0,%1,1),%%xmm0            \n"
+      "movq      0x00(%0,%1,2),%%xmm1            \n"
+      "movhps    0x00(%0,%4,1),%%xmm1            \n"
+      "lea       0x00(%0,%1,4),%0                \n"
+      "movq      (%5),%%xmm2                     \n"
+      "movhps    0x00(%5,%1,1),%%xmm2            \n"
+      "movq      0x00(%5,%1,2),%%xmm3            \n"
+      "movhps    0x00(%5,%4,1),%%xmm3            \n"
+      "lea       0x00(%5,%1,4),%5                \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "pavgb     %%xmm3,%%xmm1                   \n"
+      "movdqa    %%xmm0,%%xmm2                   \n"
+      "shufps    $0x88,%%xmm1,%%xmm0             \n"
+      "shufps    $0xdd,%%xmm1,%%xmm2             \n"
+      "pavgb     %%xmm2,%%xmm0                   \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%3                         \n"
+      "jg        1b                              \n"
+      : "+r"(src_argb),        // %0
+        "+r"(src_stepx_x4),    // %1
+        "+r"(dst_argb),        // %2
+        "+rm"(dst_width),      // %3
+        "=&r"(src_stepx_x12),  // %4
+        "+r"(row1)             // %5
+        ::"memory",
+        "cc", "xmm0", "xmm1", "xmm2", "xmm3");
 }
 
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
   intptr_t x0, x1;
-  asm volatile (
-    "movd      %5,%%xmm2                       \n"
-    "movd      %6,%%xmm3                       \n"
-    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
-    "pshufd    $0x11,%%xmm3,%%xmm0             \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pshufd    $0x5,%%xmm3,%%xmm0              \n"
-    "paddd     %%xmm0,%%xmm2                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
-    "pextrw    $0x1,%%xmm2,%k0                 \n"
-    "pextrw    $0x3,%%xmm2,%k1                 \n"
-    "cmp       $0x0,%4                         \n"
-    "jl        99f                             \n"
-    "sub       $0x4,%4                         \n"
-    "jl        49f                             \n"
+  asm volatile(
+      "movd      %5,%%xmm2                       \n"
+      "movd      %6,%%xmm3                       \n"
+      "pshufd    $0x0,%%xmm2,%%xmm2              \n"
+      "pshufd    $0x11,%%xmm3,%%xmm0             \n"
+      "paddd     %%xmm0,%%xmm2                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pshufd    $0x5,%%xmm3,%%xmm0              \n"
+      "paddd     %%xmm0,%%xmm2                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pshufd    $0x0,%%xmm3,%%xmm3              \n"
+      "pextrw    $0x1,%%xmm2,%k0                 \n"
+      "pextrw    $0x3,%%xmm2,%k1                 \n"
+      "cmp       $0x0,%4                         \n"
+      "jl        99f                             \n"
+      "sub       $0x4,%4                         \n"
+      "jl        49f                             \n"
 
-    LABELALIGN
-  "40:                                         \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
-    "pextrw    $0x5,%%xmm2,%k0                 \n"
-    "pextrw    $0x7,%%xmm2,%k1                 \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
-    MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
-    "pextrw    $0x1,%%xmm2,%k0                 \n"
-    "pextrw    $0x3,%%xmm2,%k1                 \n"
-    "punpckldq %%xmm4,%%xmm1                   \n"
-    "punpcklqdq %%xmm1,%%xmm0                  \n"
-    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x10,2) ",%2           \n"
-    "sub       $0x4,%4                         \n"
-    "jge       40b                             \n"
+      LABELALIGN
+      "40:                                       \n"
+      "movd      0x00(%3,%0,4),%%xmm0            \n"
+      "movd      0x00(%3,%1,4),%%xmm1            \n"
+      "pextrw    $0x5,%%xmm2,%k0                 \n"
+      "pextrw    $0x7,%%xmm2,%k1                 \n"
+      "paddd     %%xmm3,%%xmm2                   \n"
+      "punpckldq %%xmm1,%%xmm0                   \n"
+      "movd      0x00(%3,%0,4),%%xmm1            \n"
+      "movd      0x00(%3,%1,4),%%xmm4            \n"
+      "pextrw    $0x1,%%xmm2,%k0                 \n"
+      "pextrw    $0x3,%%xmm2,%k1                 \n"
+      "punpckldq %%xmm4,%%xmm1                   \n"
+      "punpcklqdq %%xmm1,%%xmm0                  \n"
+      "movdqu    %%xmm0,(%2)                     \n"
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x4,%4                         \n"
+      "jge       40b                             \n"
 
-  "49:                                         \n"
-    "test      $0x2,%4                         \n"
-    "je        29f                             \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
-    "pextrw    $0x5,%%xmm2,%k0                 \n"
-    "punpckldq %%xmm1,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(2) "         \n"
-    "lea       " MEMLEA(0x8,2) ",%2            \n"
-  "29:                                         \n"
-    "test      $0x1,%4                         \n"
-    "je        99f                             \n"
-    MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
-    "movd      %%xmm0," MEMACCESS(2) "         \n"
-  "99:                                         \n"
-  : "=&a"(x0),         // %0
-    "=&d"(x1),         // %1
-    "+r"(dst_argb),    // %2
-    "+r"(src_argb),    // %3
-    "+r"(dst_width)    // %4
-  : "rm"(x),           // %5
-    "rm"(dx)           // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
-  );
+      "49:                                       \n"
+      "test      $0x2,%4                         \n"
+      "je        29f                             \n"
+      "movd      0x00(%3,%0,4),%%xmm0            \n"
+      "movd      0x00(%3,%1,4),%%xmm1            \n"
+      "pextrw    $0x5,%%xmm2,%k0                 \n"
+      "punpckldq %%xmm1,%%xmm0                   \n"
+      "movq      %%xmm0,(%2)                     \n"
+      "lea       0x8(%2),%2                      \n"
+      "29:                                       \n"
+      "test      $0x1,%4                         \n"
+      "je        99f                             \n"
+      "movd      0x00(%3,%0,4),%%xmm0            \n"
+      "movd      %%xmm0,(%2)                     \n"
+      "99:                                       \n"
+      : "=&a"(x0),       // %0
+        "=&d"(x1),       // %1
+        "+r"(dst_argb),  // %2
+        "+r"(src_argb),  // %3
+        "+r"(dst_width)  // %4
+      : "rm"(x),         // %5
+        "rm"(dx)         // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
-  asm volatile (
-    LABELALIGN
-  "1:                                          \n"
-    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
-    "lea       " MEMLEA(0x10,1) ",%1           \n"
-    "movdqa    %%xmm0,%%xmm1                   \n"
-    "punpckldq %%xmm0,%%xmm0                   \n"
-    "punpckhdq %%xmm1,%%xmm1                   \n"
-    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
-    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
-    "lea       " MEMLEA(0x20,0) ",%0           \n"
-    "sub       $0x8,%2                         \n"
-    "jg        1b                              \n"
+void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                           const uint8_t* src_argb,
+                           int dst_width,
+                           int x,
+                           int dx) {
+  (void)x;
+  (void)dx;
+  asm volatile(
 
-  : "+r"(dst_argb),    // %0
-    "+r"(src_argb),    // %1
-    "+r"(dst_width)    // %2
-  :: "memory", "cc", NACL_R14
-    "xmm0", "xmm1"
-  );
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%1),%%xmm0                     \n"
+      "lea       0x10(%1),%1                     \n"
+      "movdqa    %%xmm0,%%xmm1                   \n"
+      "punpckldq %%xmm0,%%xmm0                   \n"
+      "punpckhdq %%xmm1,%%xmm1                   \n"
+      "movdqu    %%xmm0,(%0)                     \n"
+      "movdqu    %%xmm1,0x10(%0)                 \n"
+      "lea       0x20(%0),%0                     \n"
+      "sub       $0x8,%2                         \n"
+      "jg        1b                              \n"
+
+      : "+r"(dst_argb),  // %0
+        "+r"(src_argb),  // %1
+        "+r"(dst_width)  // %2
+        ::"memory",
+        "cc", "xmm0", "xmm1");
 }
 
 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
-  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
-  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+static const uvec8 kShuffleColARGB = {
+    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
+    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };
 
 // Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
-  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+static const uvec8 kShuffleFractions = {
+    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
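
To see what these tables set up (byte indices are from the tables above): pshufb with kShuffleColARGB interleaves two ARGB pixels channel-wise so every pmaddubsw pair holds one channel of both pixels, and kShuffleFractions broadcasts the two 7-bit weights across those pairs:

    //  pixel bytes : b0 g0 r0 a0  b1 g1 r1 a1    (two ARGB pixels)
    //  after pshufb: b0 b1 g0 g1  r0 r1 a0 a1    pairs (b0,b1), (g0,g1), ...
    //  weights     : f f f f f f f f  -> pxor with 0x007f.. flips alternate
    //  bytes, giving (127 - f, f) per pair, so pmaddubsw yields
    //  b0*(127 - f) + b1*f (and likewise per channel) in a 16-bit lane.
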
 
 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                               const uint8_t* src_argb,
+                               int dst_width,
+                               int x,
+                               int dx) {
   intptr_t x0, x1;
-  asm volatile (
-    "movdqa    %0,%%xmm4                       \n"
-    "movdqa    %1,%%xmm5                       \n"
-  :
-  : "m"(kShuffleColARGB),  // %0
-    "m"(kShuffleFractions)  // %1
-  );
+  asm volatile(
+      "movdqa    %0,%%xmm4                       \n"
+      "movdqa    %1,%%xmm5                       \n"
+      :
+      : "m"(kShuffleColARGB),   // %0
+        "m"(kShuffleFractions)  // %1
+      );
 
-  asm volatile (
-    "movd      %5,%%xmm2                       \n"
-    "movd      %6,%%xmm3                       \n"
-    "pcmpeqb   %%xmm6,%%xmm6                   \n"
-    "psrlw     $0x9,%%xmm6                     \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "sub       $0x2,%2                         \n"
-    "jl        29f                             \n"
-    "movdqa    %%xmm2,%%xmm0                   \n"
-    "paddd     %%xmm3,%%xmm0                   \n"
-    "punpckldq %%xmm0,%%xmm2                   \n"
-    "punpckldq %%xmm3,%%xmm3                   \n"
-    "paddd     %%xmm3,%%xmm3                   \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
+  asm volatile(
+      "movd      %5,%%xmm2                       \n"
+      "movd      %6,%%xmm3                       \n"
+      "pcmpeqb   %%xmm6,%%xmm6                   \n"
+      "psrlw     $0x9,%%xmm6                     \n"
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "sub       $0x2,%2                         \n"
+      "jl        29f                             \n"
+      "movdqa    %%xmm2,%%xmm0                   \n"
+      "paddd     %%xmm3,%%xmm0                   \n"
+      "punpckldq %%xmm0,%%xmm2                   \n"
+      "punpckldq %%xmm3,%%xmm3                   \n"
+      "paddd     %%xmm3,%%xmm3                   \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
 
-    LABELALIGN
-  "2:                                          \n"
-    "movdqa    %%xmm2,%%xmm1                   \n"
-    "paddd     %%xmm3,%%xmm2                   \n"
-    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
-    "psrlw     $0x9,%%xmm1                     \n"
-    MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
-    "pshufb    %%xmm5,%%xmm1                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pxor      %%xmm6,%%xmm1                   \n"
-    "pmaddubsw %%xmm1,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "pextrw    $0x1,%%xmm2,%k3                 \n"
-    "pextrw    $0x3,%%xmm2,%k4                 \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movq      %%xmm0," MEMACCESS(0) "         \n"
-    "lea       " MEMLEA(0x8,0) ",%0            \n"
-    "sub       $0x2,%2                         \n"
-    "jge       2b                              \n"
+      LABELALIGN
+      "2:                                        \n"
+      "movdqa    %%xmm2,%%xmm1                   \n"
+      "paddd     %%xmm3,%%xmm2                   \n"
+      "movq      0x00(%1,%3,4),%%xmm0            \n"
+      "psrlw     $0x9,%%xmm1                     \n"
+      "movhps    0x00(%1,%4,4),%%xmm0            \n"
+      "pshufb    %%xmm5,%%xmm1                   \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "pxor      %%xmm6,%%xmm1                   \n"
+      "pmaddubsw %%xmm1,%%xmm0                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "pextrw    $0x1,%%xmm2,%k3                 \n"
+      "pextrw    $0x3,%%xmm2,%k4                 \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movq      %%xmm0,(%0)                     \n"
+      "lea       0x8(%0),%0                      \n"
+      "sub       $0x2,%2                         \n"
+      "jge       2b                              \n"
 
-    LABELALIGN
-  "29:                                         \n"
-    "add       $0x1,%2                         \n"
-    "jl        99f                             \n"
-    "psrlw     $0x9,%%xmm2                     \n"
-    MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
-    "pshufb    %%xmm5,%%xmm2                   \n"
-    "pshufb    %%xmm4,%%xmm0                   \n"
-    "pxor      %%xmm6,%%xmm2                   \n"
-    "pmaddubsw %%xmm2,%%xmm0                   \n"
-    "psrlw     $0x7,%%xmm0                     \n"
-    "packuswb  %%xmm0,%%xmm0                   \n"
-    "movd      %%xmm0," MEMACCESS(0) "         \n"
+      LABELALIGN
+      "29:                                       \n"
+      "add       $0x1,%2                         \n"
+      "jl        99f                             \n"
+      "psrlw     $0x9,%%xmm2                     \n"
+      "movq      0x00(%1,%3,4),%%xmm0            \n"
+      "pshufb    %%xmm5,%%xmm2                   \n"
+      "pshufb    %%xmm4,%%xmm0                   \n"
+      "pxor      %%xmm6,%%xmm2                   \n"
+      "pmaddubsw %%xmm2,%%xmm0                   \n"
+      "psrlw     $0x7,%%xmm0                     \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"
+      "movd      %%xmm0,(%0)                     \n"
 
-    LABELALIGN
-  "99:                                         \n"
-  : "+r"(dst_argb),    // %0
-    "+r"(src_argb),    // %1
-    "+rm"(dst_width),  // %2
-    "=&r"(x0),         // %3
-    "=&r"(x1)          // %4
-  : "rm"(x),           // %5
-    "rm"(dx)           // %6
-  : "memory", "cc", NACL_R14
-    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
-  );
+      LABELALIGN "99:                            \n"  // clang-format error.
+
+      : "+r"(dst_argb),    // %0
+        "+r"(src_argb),    // %1
+        "+rm"(dst_width),  // %2
+        "=&r"(x0),         // %3
+        "=&r"(x1)          // %4
+      : "rm"(x),           // %5
+        "rm"(dx)           // %6
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
 }
 
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv_X86(int num, int div) {
-  asm volatile (
-    "cdq                                       \n"
-    "shld      $0x10,%%eax,%%edx               \n"
-    "shl       $0x10,%%eax                     \n"
-    "idiv      %1                              \n"
-    "mov       %0, %%eax                       \n"
-    : "+a"(num)  // %0
-    : "c"(div)   // %1
-    : "memory", "cc", "edx"
-  );
+  asm volatile(
+      "cdq                                       \n"
+      "shld      $0x10,%%eax,%%edx               \n"
+      "shl       $0x10,%%eax                     \n"
+      "idiv      %1                              \n"
+      "mov       %0, %%eax                       \n"
+      : "+a"(num)  // %0
+      : "c"(div)   // %1
+      : "memory", "cc", "edx");
   return num;
 }
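
A portable sketch of the same 16.16 division, via a 64-bit intermediate (this mirrors the form libyuv's C path takes; shown here as an illustration). The asm above does the equivalent widening with cdq/shld so the dividend fits edx:eax for idiv:

    static int FixedDiv_sketch(int num, int div) {
      return (int)(((int64_t)num << 16) / div);
    }
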
 
 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
 int FixedDiv1_X86(int num, int div) {
-  asm volatile (
-    "cdq                                       \n"
-    "shld      $0x10,%%eax,%%edx               \n"
-    "shl       $0x10,%%eax                     \n"
-    "sub       $0x10001,%%eax                  \n"
-    "sbb       $0x0,%%edx                      \n"
-    "sub       $0x1,%1                         \n"
-    "idiv      %1                              \n"
-    "mov       %0, %%eax                       \n"
-    : "+a"(num)  // %0
-    : "c"(div)   // %1
-    : "memory", "cc", "edx"
-  );
+  asm volatile(
+      "cdq                                       \n"
+      "shld      $0x10,%%eax,%%edx               \n"
+      "shl       $0x10,%%eax                     \n"
+      "sub       $0x10001,%%eax                  \n"
+      "sbb       $0x0,%%edx                      \n"
+      "sub       $0x1,%1                         \n"
+      "idiv      %1                              \n"
+      "mov       %0, %%eax                       \n"
+      : "+a"(num)  // %0
+      : "c"(div)   // %1
+      : "memory", "cc", "edx");
   return num;
 }
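
And the matching sketch for the inclusive variant: subtracting 0x10001 turns num << 16 into ((num - 1) << 16) - 1, so the last source position maps exactly onto the last destination position:

    static int FixedDiv1_sketch(int num, int div) {
      return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
    }
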
 
--- a/third_party/libyuv/source/scale_mips.cc
+++ /dev/null
@@ -1,644 +1,0 @@
-/*
- *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS. All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "libyuv/basic_types.h"
-#include "libyuv/row.h"
-
-#ifdef __cplusplus
-namespace libyuv {
-extern "C" {
-#endif
-
-// This module is for GCC MIPS DSPR2
-#if !defined(LIBYUV_DISABLE_MIPS) && \
-    defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
-    (_MIPS_SIM == _MIPS_SIM_ABI32)
-
-void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width) {
-  __asm__ __volatile__(
-    ".set push                                     \n"
-    ".set noreorder                                \n"
-
-    "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
-    "beqz           $t9, 2f                        \n"
-    " nop                                          \n"
-
-  "1:                                              \n"
-    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
-    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
-    "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
-    "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
-    "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
-    "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
-    "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
-    "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
-    // TODO(fbarchard): Use odd pixels instead of even.
-    "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
-    "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
-    "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
-    "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
-    "addiu          %[src_ptr], %[src_ptr], 32     \n"
-    "addiu          $t9, $t9, -1                   \n"
-    "sw             $t8, 0(%[dst])                 \n"
-    "sw             $t0, 4(%[dst])                 \n"
-    "sw             $t1, 8(%[dst])                 \n"
-    "sw             $t2, 12(%[dst])                \n"
-    "bgtz           $t9, 1b                        \n"
-    " addiu         %[dst], %[dst], 16             \n"
-
-  "2:                                              \n"
-    "andi           $t9, %[dst_width], 0xf         \n"  // residue
-    "beqz           $t9, 3f                        \n"
-    " nop                                          \n"
-
-  "21:                                             \n"
-    "lbu            $t0, 0(%[src_ptr])             \n"
-    "addiu          %[src_ptr], %[src_ptr], 2      \n"
-    "addiu          $t9, $t9, -1                   \n"
-    "sb             $t0, 0(%[dst])                 \n"
-    "bgtz           $t9, 21b                       \n"
-    " addiu         %[dst], %[dst], 1              \n"
-
-  "3:                                              \n"
-    ".set pop                                      \n"
-  : [src_ptr] "+r" (src_ptr),
-    [dst] "+r" (dst)
-  : [dst_width] "r" (dst_width)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  const uint8* t = src_ptr + src_stride;
-
-  __asm__ __volatile__ (
-    ".set push                                    \n"
-    ".set noreorder                               \n"
-
-    "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
-    "bltz           $t9, 2f                       \n"
-    " nop                                         \n"
-
-  "1:                                             \n"
-    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
-    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
-    "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
-    "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
-    "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
-    "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
-    "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
-    "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
-    "addiu          $t9, $t9, -1                  \n"
-    "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
-    "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
-    "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
-    "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
-    "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
-    "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
-    "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
-    "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
-    "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
-    "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
-    "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
-    "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
-    "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
-    "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
-    "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
-    "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
-    "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
-    "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
-    "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
-    "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
-    "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
-    "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
-    "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
-    "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
-    "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
-    "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
-    "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
-    "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
-    "addiu          %[src_ptr], %[src_ptr], 16    \n"
-    "addiu          %[t], %[t], 16                \n"
-    "sb             $t0, 0(%[dst])                \n"
-    "sb             $t4, 1(%[dst])                \n"
-    "sb             $t1, 2(%[dst])                \n"
-    "sb             $t5, 3(%[dst])                \n"
-    "sb             $t2, 4(%[dst])                \n"
-    "sb             $t6, 5(%[dst])                \n"
-    "sb             $t3, 6(%[dst])                \n"
-    "sb             $t7, 7(%[dst])                \n"
-    "bgtz           $t9, 1b                       \n"
-    " addiu         %[dst], %[dst], 8             \n"
-
-  "2:                                             \n"
-    "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
-    "beqz           $t9, 3f                       \n"
-    " nop                                         \n"
-
-    "21:                                          \n"
-    "lwr            $t1, 0(%[src_ptr])            \n"
-    "lwl            $t1, 3(%[src_ptr])            \n"
-    "lwr            $t2, 0(%[t])                  \n"
-    "lwl            $t2, 3(%[t])                  \n"
-    "srl            $t8, $t1, 16                  \n"
-    "ins            $t1, $t2, 16, 16              \n"
-    "ins            $t2, $t8, 0, 16               \n"
-    "raddu.w.qb     $t1, $t1                      \n"
-    "raddu.w.qb     $t2, $t2                      \n"
-    "shra_r.w       $t1, $t1, 2                   \n"
-    "shra_r.w       $t2, $t2, 2                   \n"
-    "sb             $t1, 0(%[dst])                \n"
-    "sb             $t2, 1(%[dst])                \n"
-    "addiu          %[src_ptr], %[src_ptr], 4     \n"
-    "addiu          $t9, $t9, -2                  \n"
-    "addiu          %[t], %[t], 4                 \n"
-    "bgtz           $t9, 21b                      \n"
-    " addiu         %[dst], %[dst], 2             \n"
-
-  "3:                                             \n"
-    ".set pop                                     \n"
-
-  : [src_ptr] "+r" (src_ptr),
-    [dst] "+r" (dst), [t] "+r" (t)
-  : [dst_width] "r" (dst_width)
-  : "t0", "t1", "t2", "t3", "t4", "t5",
-    "t6", "t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                    \n"
-      ".set noreorder                               \n"
-
-      "srl            $t9, %[dst_width], 3          \n"
-      "beqz           $t9, 2f                       \n"
-      " nop                                         \n"
-
-     "1:                                            \n"
-      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
-      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
-      "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
-      "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
-      "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
-      "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
-      "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
-      "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
-      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
-      "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
-      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
-      "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
-      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
-      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
-      "addiu          %[src_ptr], %[src_ptr], 32    \n"
-      "addiu          $t9, $t9, -1                  \n"
-      "sw             $t1, 0(%[dst])                \n"
-      "sw             $t5, 4(%[dst])                \n"
-      "bgtz           $t9, 1b                       \n"
-      " addiu         %[dst], %[dst], 8             \n"
-
-    "2:                                             \n"
-      "andi           $t9, %[dst_width], 7          \n"  // residue
-      "beqz           $t9, 3f                       \n"
-      " nop                                         \n"
-
-    "21:                                            \n"
-      "lbu            $t1, 0(%[src_ptr])            \n"
-      "addiu          %[src_ptr], %[src_ptr], 4     \n"
-      "addiu          $t9, $t9, -1                  \n"
-      "sb             $t1, 0(%[dst])                \n"
-      "bgtz           $t9, 21b                      \n"
-      " addiu         %[dst], %[dst], 1             \n"
-
-    "3:                                             \n"
-      ".set pop                                     \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst)
-      : [dst_width] "r" (dst_width)
-      : "t1", "t2", "t3", "t4", "t5",
-        "t6", "t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  intptr_t stride = src_stride;
-  const uint8* s1 = src_ptr + stride;
-  const uint8* s2 = s1 + stride;
-  const uint8* s3 = s2 + stride;
-
-  __asm__ __volatile__ (
-      ".set push                                  \n"
-      ".set noreorder                             \n"
-
-      "srl           $t9, %[dst_width], 1         \n"
-      "andi          $t8, %[dst_width], 1         \n"
-
-     "1:                                          \n"
-      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
-      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
-      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
-      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
-      "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
-      "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
-      "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
-      "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
-      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
-      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
-      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
-      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
-      "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
-      "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
-      "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
-      "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
-      "add           $t0, $t0, $t1                \n"
-      "add           $t1, $t2, $t3                \n"
-      "add           $t0, $t0, $t1                \n"
-      "add           $t4, $t4, $t5                \n"
-      "add           $t6, $t6, $t7                \n"
-      "add           $t4, $t4, $t6                \n"
-      "shra_r.w      $t0, $t0, 4                  \n"
-      "shra_r.w      $t4, $t4, 4                  \n"
-      "sb            $t0, 0(%[dst])               \n"
-      "sb            $t4, 1(%[dst])               \n"
-      "addiu         %[src_ptr], %[src_ptr], 8    \n"
-      "addiu         %[s1], %[s1], 8              \n"
-      "addiu         %[s2], %[s2], 8              \n"
-      "addiu         %[s3], %[s3], 8              \n"
-      "addiu         $t9, $t9, -1                 \n"
-      "bgtz          $t9, 1b                      \n"
-      " addiu        %[dst], %[dst], 2            \n"
-      "beqz          $t8, 2f                      \n"
-      " nop                                       \n"
-
-      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
-      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
-      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
-      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
-      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
-      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
-      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
-      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
-      "add           $t0, $t0, $t1                \n"
-      "add           $t1, $t2, $t3                \n"
-      "add           $t0, $t0, $t1                \n"
-      "shra_r.w      $t0, $t0, 4                  \n"
-      "sb            $t0, 0(%[dst])               \n"
-
-      "2:                                         \n"
-      ".set pop                                   \n"
-
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [s1] "+r" (s1),
-        [s2] "+r" (s2),
-        [s3] "+r" (s3)
-      : [dst_width] "r" (dst_width)
-      : "t0", "t1", "t2", "t3", "t4", "t5",
-        "t6","t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                          \n"
-      ".set noreorder                                     \n"
-    "1:                                                   \n"
-      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
-      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
-      "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
-      "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
-      "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
-      "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
-      "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
-      "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
-      "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
-      "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|30|
-      "addiu           %[dst_width], %[dst_width], -24    \n"
-      "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
-      "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
-      "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
-      "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
-      "addiu           %[src_ptr], %[src_ptr], 32         \n"
-      "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
-      "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
-      "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
-      "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
-      "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
-      "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
-      "sw              $t1, 0(%[dst])                     \n"
-      "sw              $t0, 4(%[dst])                     \n"
-      "sw              $t3, 8(%[dst])                     \n"
-      "sw              $t5, 12(%[dst])                    \n"
-      "sw              $t9, 16(%[dst])                    \n"
-      "sw              $t7, 20(%[dst])                    \n"
-      "bnez            %[dst_width], 1b                   \n"
-      " addiu          %[dst], %[dst], 24                 \n"
-      ".set pop                                           \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3", "t4", "t5",
-        "t6","t7", "t8", "t9"
-  );
-}
-
-void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-      "repl.ph           $t3, 3                          \n"  // 0x00030003
-
-    "1:                                                  \n"
-      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
-      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
-      "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
-      "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
-      "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
-      "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
-      "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
-      "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
-      "raddu.w.qb        $t0, $t0                        \n"
-      "raddu.w.qb        $t1, $t1                        \n"
-      "shra_r.w          $t0, $t0, 1                     \n"
-      "shra_r.w          $t1, $t1, 1                     \n"
-      "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
-      "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
-      "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
-      "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
-      "addu.ph           $t2, $t2, $t4                   \n"
-      "addu.ph           $t6, $t6, $t5                   \n"
-      "sll               $t5, $t0, 1                     \n"
-      "add               $t0, $t5, $t0                   \n"
-      "shra_r.ph         $t2, $t2, 2                     \n"
-      "shra_r.ph         $t6, $t6, 2                     \n"
-      "shll.ph           $t4, $t2, 1                     \n"
-      "addq.ph           $t4, $t4, $t2                   \n"
-      "addu              $t0, $t0, $t1                   \n"
-      "addiu             %[src_ptr], %[src_ptr], 4       \n"
-      "shra_r.w          $t0, $t0, 2                     \n"
-      "addu.ph           $t6, $t6, $t4                   \n"
-      "shra_r.ph         $t6, $t6, 2                     \n"
-      "srl               $t1, $t6, 16                    \n"
-      "addiu             %[dst_width], %[dst_width], -3  \n"
-      "sb                $t1, 0(%[d])                    \n"
-      "sb                $t0, 1(%[d])                    \n"
-      "sb                $t6, 2(%[d])                    \n"
-      "bgtz              %[dst_width], 1b                \n"
-      " addiu            %[d], %[d], 3                   \n"
-    "3:                                                  \n"
-      ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [src_stride] "+r" (src_stride),
-        [d] "+r" (d),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3",
-        "t4", "t5", "t6"
-  );
-}
-
-void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* d, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                           \n"
-      ".set noreorder                                      \n"
-      "repl.ph           $t2, 3                            \n"  // 0x00030003
-
-    "1:                                                    \n"
-      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
-      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
-      "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
-      "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
-      "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
-      "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
-      "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
-      "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
-      "raddu.w.qb        $t0, $t0                          \n"
-      "raddu.w.qb        $t1, $t1                          \n"
-      "shra_r.w          $t0, $t0, 1                       \n"
-      "shra_r.w          $t1, $t1, 1                       \n"
-      "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
-      "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
-      "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
-      "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
-      "addu.ph           $t4, $t4, $t3                     \n"
-      "addu.ph           $t6, $t6, $t5                     \n"
-      "shra_r.ph         $t6, $t6, 2                       \n"
-      "shra_r.ph         $t4, $t4, 2                       \n"
-      "addu.ph           $t6, $t6, $t4                     \n"
-      "addiu             %[src_ptr], %[src_ptr], 4         \n"
-      "shra_r.ph         $t6, $t6, 1                       \n"
-      "addu              $t0, $t0, $t1                     \n"
-      "addiu             %[dst_width], %[dst_width], -3    \n"
-      "shra_r.w          $t0, $t0, 1                       \n"
-      "srl               $t1, $t6, 16                      \n"
-      "sb                $t1, 0(%[d])                      \n"
-      "sb                $t0, 1(%[d])                      \n"
-      "sb                $t6, 2(%[d])                      \n"
-      "bgtz              %[dst_width], 1b                  \n"
-      " addiu            %[d], %[d], 3                     \n"
-    "3:                                                    \n"
-      ".set pop                                            \n"
-      : [src_ptr] "+r" (src_ptr),
-        [src_stride] "+r" (src_stride),
-        [d] "+r" (d),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3",
-        "t4", "t5", "t6"
-  );
-}
-
-void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst, int dst_width) {
-  __asm__ __volatile__ (
-      ".set push                                     \n"
-      ".set noreorder                                \n"
-
-    "1:                                              \n"
-      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
-      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
-      "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
-      "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
-      "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
-      "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
-      "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
-      "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
-      "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
-      "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
-      "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
-      "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
-      "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
-      "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
-      "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
-      "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
-      "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
-      "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
-      "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
-      "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
-      "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
-      "addiu      %[src_ptr], %[src_ptr], 32         \n"
-      "addiu      %[dst_width], %[dst_width], -12    \n"
-      "addiu      $t8,%[dst_width], -12              \n"
-      "sw         $t1, 0(%[dst])                     \n"
-      "sw         $t4, 4(%[dst])                     \n"
-      "sw         $t6, 8(%[dst])                     \n"
-      "bgez       $t8, 1b                            \n"
-      " addiu     %[dst], %[dst], 12                 \n"
-      ".set pop                                      \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst] "+r" (dst),
-        [dst_width] "+r" (dst_width)
-      :
-      : "t0", "t1", "t2", "t3", "t4",
-        "t5", "t6", "t7", "t8"
-  );
-}
-
-void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  intptr_t stride = src_stride;
-  const uint8* t = src_ptr + stride;
-  const int c = 0x2AAA;
-
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-
-    "1:                                                  \n"
-      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
-      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
-      "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
-      "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
-      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
-      "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
-      "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
-      "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
-      "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
-      "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
-      "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
-      "srl             $t4, $t4, 2                       \n"  // t4 / 4
-      "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
-      "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
-      "addu            $t6, $t5, $t6                     \n"
-      "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
-      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
-      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
-      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
-      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
-      "addu            $t0, $t0, $t2                     \n"
-      "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
-      "addiu           %[src_ptr], %[src_ptr], 8         \n"
-      "addiu           %[t], %[t], 8                     \n"
-      "addiu           %[dst_width], %[dst_width], -3    \n"
-      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
-      "srl             $t6, $t6, 16                      \n"
-      "srl             $t0, $t0, 16                      \n"
-      "sb              $t4, -1(%[dst_ptr])               \n"
-      "sb              $t6, -2(%[dst_ptr])               \n"
-      "bgtz            %[dst_width], 1b                  \n"
-      " sb             $t0, -3(%[dst_ptr])               \n"
-      ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst_ptr] "+r" (dst_ptr),
-        [t] "+r" (t),
-        [dst_width] "+r" (dst_width)
-      : [c] "r" (c)
-      : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
-  );
-}
-
-void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
-  intptr_t stride = src_stride;
-  const uint8* s1 = src_ptr + stride;
-  stride += stride;
-  const uint8* s2 = src_ptr + stride;
-  const int c1 = 0x1C71;
-  const int c2 = 0x2AAA;
-
-  __asm__ __volatile__ (
-      ".set push                                         \n"
-      ".set noreorder                                    \n"
-
-    "1:                                                  \n"
-      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
-      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
-      "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
-      "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
-      "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
-      "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
-      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
-      "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
-      "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
-      "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
-      "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
-      "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
-      "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
-      "addu            $t7, $t7, $t8                     \n"
-      "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
-      "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
-      "addu            $t6, $t6, $t8                     \n"
-      "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
-      "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
-      "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
-      "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
-      "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
-      "addu            $t7, $t7, $t8                     \n"
-      "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
-      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
-      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
-      "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
-      "raddu.w.qb      $t0, $t0                          \n"
-      "raddu.w.qb      $t2, $t2                          \n"
-      "raddu.w.qb      $t4, $t4                          \n"
-      "addu            $t0, $t0, $t2                     \n"
-      "addu            $t0, $t0, $t4                     \n"
-      "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
-      "addiu           %[src_ptr], %[src_ptr], 8         \n"
-      "addiu           %[s1], %[s1], 8                   \n"
-      "addiu           %[s2], %[s2], 8                   \n"
-      "addiu           %[dst_width], %[dst_width], -3    \n"
-      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"
-      "srl             $t6, $t6, 16                      \n"
-      "srl             $t7, $t7, 16                      \n"
-      "srl             $t0, $t0, 16                      \n"
-      "sb              $t6, -1(%[dst_ptr])               \n"
-      "sb              $t7, -2(%[dst_ptr])               \n"
-      "bgtz            %[dst_width], 1b                  \n"
-      " sb             $t0, -3(%[dst_ptr])               \n"
-      ".set pop                                          \n"
-      : [src_ptr] "+r" (src_ptr),
-        [dst_ptr] "+r" (dst_ptr),
-        [s1] "+r" (s1),
-        [s2] "+r" (s2),
-        [dst_width] "+r" (dst_width)
-      : [c1] "r" (c1), [c2] "r" (c2)
-      : "t0", "t1", "t2", "t3", "t4",
-        "t5", "t6", "t7", "t8"
-  );
-}
-
-#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
-
-#ifdef __cplusplus
-}  // extern "C"
-}  // namespace libyuv
-#endif
-
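The MSA file added below keeps the fixed-point reciprocals the DSPR2 code
used for its 3/8 box filters: 0x2AAA = 10922 ~ 65536/6 (sum of a 3x2 or
2x3 pixel block) and 0x1C71 = 7281 ~ 65536/9 (sum of a 3x3 block), so an
integer average falls out of one multiply and a 16-bit shift:

    /* Sketch of the reciprocal-multiply trick (sums fit in 16 bits). */
    uint8_t avg6 = (uint8_t)((sum6 * 0x2AAA) >> 16);  /* ~ sum6 / 6 */
    uint8_t avg9 = (uint8_t)((sum9 * 0x1C71) >> 16);  /* ~ sum9 / 9 */
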
--- /dev/null
+++ b/third_party/libyuv/source/scale_msa.cc
@@ -1,0 +1,949 @@
+/*
+ *  Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "libyuv/scale_row.h"
+
+// This module is for GCC MSA
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include "libyuv/macros_msa.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
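+// MSA has no vector gather instruction, so indexed loads are done one
+// element at a time through this scalar helper macro.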
+#define LOAD_INDEXED_DATA(srcp, indx0, out0) \
+  {                                          \
+    out0[0] = srcp[indx0[0]];                \
+    out0[1] = srcp[indx0[1]];                \
+    out0[2] = srcp[indx0[2]];                \
+    out0[3] = srcp[indx0[3]];                \
+  }
+
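+// 1/2 ARGB scale: keep the odd pixel of each pair (pckod_w on 32-bit lanes).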
+void ScaleARGBRowDown2_MSA(const uint8_t* src_argb,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_argb,
+                           int dst_width) {
+  int x;
+  v16u8 src0, src1, dst0;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    ST_UB(dst0, dst_argb);
+    src_argb += 32;
+    dst_argb += 16;
+  }
+}
+
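+// 1/2 ARGB with linear filter: rounding average of each adjacent pixel pair.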
+void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  int x;
+  v16u8 src0, src1, vec0, vec1, dst0;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16);
+    vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0);
+    vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0);
+    dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1);
+    ST_UB(dst0, dst_argb);
+    src_argb += 32;
+    dst_argb += 16;
+  }
+}
+
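+// 1/2 ARGB box filter: rounded average of each 2x2 pixel block.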
+void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  int x;
+  const uint8_t* s = src_argb;
+  const uint8_t* t = src_argb + src_stride;
+  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0;
+  v8u16 reg0, reg1, reg2, reg3;
+  v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15};
+
+  for (x = 0; x < dst_width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0);
+    vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1);
+    vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2);
+    vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3);
+    reg0 = __msa_hadd_u_h(vec0, vec0);
+    reg1 = __msa_hadd_u_h(vec1, vec1);
+    reg2 = __msa_hadd_u_h(vec2, vec2);
+    reg3 = __msa_hadd_u_h(vec3, vec3);
+    reg0 += reg2;
+    reg1 += reg3;
+    reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2);
+    reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    ST_UB(dst0, dst_argb);
+    s += 32;
+    t += 32;
+    dst_argb += 16;
+  }
+}
+
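+// Copies every src_stepx-th ARGB pixel using scalar 32-bit loads/stores.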
+void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb,
+                              ptrdiff_t src_stride,
+                              int32_t src_stepx,
+                              uint8_t* dst_argb,
+                              int dst_width) {
+  int x;
+  int32_t stepx = src_stepx * 4;
+  int32_t data0, data1, data2, data3;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 4) {
+    data0 = LW(src_argb);
+    data1 = LW(src_argb + stepx);
+    data2 = LW(src_argb + stepx * 2);
+    data3 = LW(src_argb + stepx * 3);
+    SW(data0, dst_argb);
+    SW(data1, dst_argb + 4);
+    SW(data2, dst_argb + 8);
+    SW(data3, dst_argb + 12);
+    src_argb += stepx * 4;
+    dst_argb += 16;
+  }
+}
+
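+// Box-filtered even down-sample: each output pixel is the rounded average
+// of a 2x2 block read from this row and the next.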
+void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb,
+                                 ptrdiff_t src_stride,
+                                 int src_stepx,
+                                 uint8_t* dst_argb,
+                                 int dst_width) {
+  int x;
+  const uint8_t* nxt_argb = src_argb + src_stride;
+  int32_t stepx = src_stepx * 4;
+  int64_t data0, data1, data2, data3;
+  v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0};
+  v16u8 vec0, vec1, vec2, vec3;
+  v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 dst0;
+
+  for (x = 0; x < dst_width; x += 4) {
+    data0 = LD(src_argb);
+    data1 = LD(src_argb + stepx);
+    data2 = LD(src_argb + stepx * 2);
+    data3 = LD(src_argb + stepx * 3);
+    src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0);
+    src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1);
+    src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2);
+    src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3);
+    data0 = LD(nxt_argb);
+    data1 = LD(nxt_argb + stepx);
+    data2 = LD(nxt_argb + stepx * 2);
+    data3 = LD(nxt_argb + stepx * 3);
+    src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0);
+    src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1);
+    src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2);
+    src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3);
+    vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    reg0 = __msa_hadd_u_h(vec0, vec0);
+    reg1 = __msa_hadd_u_h(vec1, vec1);
+    reg2 = __msa_hadd_u_h(vec2, vec2);
+    reg3 = __msa_hadd_u_h(vec3, vec3);
+    reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0);
+    reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1);
+    reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0);
+    reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1);
+    reg4 += reg6;
+    reg5 += reg7;
+    reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2);
+    reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+    ST_UB(dst0, dst_argb);
+    src_argb += stepx * 4;
+    nxt_argb += stepx * 4;
+    dst_argb += 16;
+  }
+}
+
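+// 1/2 planar scale: keep the odd source bytes (64 in, 32 out per loop).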
+void ScaleRowDown2_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    ST_UB2(dst0, dst1, dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
+
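+// 1/2 planar with linear filter: rounding average of adjacent byte pairs.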
+void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr,
+                             ptrdiff_t src_stride,
+                             uint8_t* dst,
+                             int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0);
+    vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2);
+    dst0 = __msa_aver_u_b(vec1, vec0);
+    dst1 = __msa_aver_u_b(vec3, vec2);
+    ST_UB2(dst0, dst1, dst, 16);
+    src_ptr += 64;
+    dst += 32;
+  }
+}
+
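+// 1/2 planar box filter: pairwise horizontal add across two rows, then a
+// rounding shift by 2 (average of 4 pixels).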
+void ScaleRowDown2Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  int x;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1;
+  v8u16 vec0, vec1, vec2, vec3;
+
+  for (x = 0; x < dst_width; x += 32) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+    vec0 = __msa_hadd_u_h(src0, src0);
+    vec1 = __msa_hadd_u_h(src1, src1);
+    vec2 = __msa_hadd_u_h(src2, src2);
+    vec3 = __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2);
+    vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2);
+    vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2);
+    vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2);
+    ST_UB2(dst0, dst1, dst, 16);
+    s += 64;
+    t += 64;
+    dst += 32;
+  }
+}
+
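+// 1/4 planar scale: pckev then pckod keeps byte 2 of every 4.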
+void ScaleRowDown4_MSA(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint8_t* dst,
+                       int dst_width) {
+  int x;
+  v16u8 src0, src1, src2, src3, vec0, vec1, dst0;
+  (void)src_stride;
+
+  for (x = 0; x < dst_width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2);
+    dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst);
+    src_ptr += 64;
+    dst += 16;
+  }
+}
+
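+// 1/4 box filter: sums a 4x4 block across four rows, then srari_w(..., 4)
+// performs a rounded divide by 16.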
+void ScaleRowDown4Box_MSA(const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          uint8_t* dst,
+                          int dst_width) {
+  int x;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t0 = s + src_stride;
+  const uint8_t* t1 = s + src_stride * 2;
+  const uint8_t* t2 = s + src_stride * 3;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0;
+  v8u16 vec0, vec1, vec2, vec3;
+  v4u32 reg0, reg1, reg2, reg3;
+
+  for (x = 0; x < dst_width; x += 16) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48);
+    vec0 = __msa_hadd_u_h(src0, src0);
+    vec1 = __msa_hadd_u_h(src1, src1);
+    vec2 = __msa_hadd_u_h(src2, src2);
+    vec3 = __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48);
+    vec0 += __msa_hadd_u_h(src0, src0);
+    vec1 += __msa_hadd_u_h(src1, src1);
+    vec2 += __msa_hadd_u_h(src2, src2);
+    vec3 += __msa_hadd_u_h(src3, src3);
+    vec0 += __msa_hadd_u_h(src4, src4);
+    vec1 += __msa_hadd_u_h(src5, src5);
+    vec2 += __msa_hadd_u_h(src6, src6);
+    vec3 += __msa_hadd_u_h(src7, src7);
+    reg0 = __msa_hadd_u_w(vec0, vec0);
+    reg1 = __msa_hadd_u_w(vec1, vec1);
+    reg2 = __msa_hadd_u_w(vec2, vec2);
+    reg3 = __msa_hadd_u_w(vec3, vec3);
+    reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4);
+    reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4);
+    reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4);
+    reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0);
+    ST_UB(dst0, dst);
+    s += 64;
+    t0 += 64;
+    t1 += 64;
+    t2 += 64;
+    dst += 16;
+  }
+}
+
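+// 3/8 planar scale: the shuffle mask keeps 12 of every 32 source bytes.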
+void ScaleRowDown38_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  int x, width;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, vec0;
+  v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
+  (void)src_stride;
+
+  assert(dst_width % 3 == 0);
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0);
+    dst0 = __msa_copy_u_d((v2i64)vec0, 0);
+    dst1 = __msa_copy_u_w((v4i32)vec0, 2);
+    SD(dst0, dst);
+    SW(dst1, dst + 8);
+    src_ptr += 32;
+    dst += 12;
+  }
+}
+
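+// 3/8 box filter over two rows; 0x2AAA ~= 65536/6 scales the 3x2-pixel sums
+// and 0x4000 = 65536/4 scales the 2x2 sum left over from each 8 columns.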
+void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  int x, width;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, src2, src3, out;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  v8i16 zero = {0};
+  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
+  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+  v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000);
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0);
+    vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1);
+    vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2);
+    vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    tmp0 = __msa_hadd_u_w(vec4, vec4);
+    tmp1 = __msa_hadd_u_w(vec5, vec5);
+    tmp2 = __msa_hadd_u_w(vec6, vec6);
+    tmp3 = __msa_hadd_u_w(vec7, vec7);
+    tmp4 = __msa_hadd_u_w(vec0, vec0);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    tmp0 = __msa_hadd_u_w(vec0, vec0);
+    tmp1 = __msa_hadd_u_w(vec1, vec1);
+    tmp0 *= const_0x2AAA;
+    tmp1 *= const_0x2AAA;
+    tmp4 *= const_0x4000;
+    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+    dst0 = __msa_copy_u_d((v2i64)out, 0);
+    dst1 = __msa_copy_u_w((v4i32)out, 2);
+    SD(dst0, dst_ptr);
+    SW(dst1, dst_ptr + 8);
+    s += 32;
+    t += 32;
+    dst_ptr += 12;
+  }
+}
+
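+// 3/8 box filter over three rows; 0x1C71 ~= 65536/9 scales the 3x3-pixel
+// sums and 0x2AAA ~= 65536/6 the 2x3 remainder.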
+void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst_ptr,
+                              int dst_width) {
+  int x, width;
+  const uint8_t* s = src_ptr;
+  const uint8_t* t0 = s + src_stride;
+  const uint8_t* t1 = s + src_stride * 2;
+  uint64_t dst0;
+  uint32_t dst1;
+  v16u8 src0, src1, src2, src3, src4, src5, out;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v4u32 tmp0, tmp1, tmp2, tmp3, tmp4;
+  v8u16 zero = {0};
+  v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9};
+  v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0};
+  v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71);
+  v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA);
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+  width = dst_width / 3;
+
+  for (x = 0; x < width; x += 4) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0);
+    src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16);
+    vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4);
+    vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4);
+    vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5);
+    vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5);
+    vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0);
+    vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1);
+    vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2);
+    vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3);
+    vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4);
+    vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5);
+    vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6);
+    vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7);
+    vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0);
+    vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1);
+    vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2);
+    vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2);
+    vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0);
+    tmp0 = __msa_hadd_u_w(vec4, vec4);
+    tmp1 = __msa_hadd_u_w(vec5, vec5);
+    tmp2 = __msa_hadd_u_w(vec6, vec6);
+    tmp3 = __msa_hadd_u_w(vec7, vec7);
+    tmp4 = __msa_hadd_u_w(vec0, vec0);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    tmp0 = __msa_hadd_u_w(vec0, vec0);
+    tmp1 = __msa_hadd_u_w(vec1, vec1);
+    tmp0 *= const_0x1C71;
+    tmp1 *= const_0x1C71;
+    tmp4 *= const_0x2AAA;
+    tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16);
+    tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16);
+    tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16);
+    vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4);
+    out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0);
+    dst0 = __msa_copy_u_d((v2i64)out, 0);
+    dst1 = __msa_copy_u_w((v4i32)out, 2);
+    SD(dst0, dst_ptr);
+    SW(dst1, dst_ptr + 8);
+    s += 32;
+    t0 += 32;
+    t1 += 32;
+    dst_ptr += 12;
+  }
+}
+
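+// Adds a row of bytes into a uint16_t accumulator row (box-filter helper).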
+void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
+  int x;
+  v16u8 src0;
+  v8u16 dst0, dst1;
+  v16i8 zero = {0};
+
+  assert(src_width > 0);
+
+  for (x = 0; x < src_width; x += 16) {
+    src0 = LD_UB(src_ptr);
+    dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0);
+    dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16);
+    dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0);
+    dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0);
+    ST_UH2(dst0, dst1, dst_ptr, 8);
+    src_ptr += 16;
+    dst_ptr += 16;
+  }
+}
+
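+// Bilinear horizontal scale: x and dx are 16.16 fixed point; the top 7 bits
+// of the fraction form the weight: dst = a + (((b - a) * f + 64) >> 7).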
+void ScaleFilterCols_MSA(uint8_t* dst_ptr,
+                         const uint8_t* src_ptr,
+                         int dst_width,
+                         int x,
+                         int dx) {
+  int j;
+  v4i32 vec_x = __msa_fill_w(x);
+  v4i32 vec_dx = __msa_fill_w(dx);
+  v4i32 vec_const = {0, 1, 2, 3};
+  v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
+  v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  v8u16 reg0, reg1;
+  v16u8 dst0;
+  v4i32 const_0xFFFF = __msa_fill_w(0xFFFF);
+  v4i32 const_0x40 = __msa_fill_w(0x40);
+
+  vec0 = vec_dx * vec_const;
+  vec1 = vec_dx * 4;
+  vec_x += vec0;
+
+  for (j = 0; j < dst_width - 1; j += 16) {
+    vec2 = vec_x >> 16;
+    vec6 = vec_x & const_0xFFFF;
+    vec_x += vec1;
+    vec3 = vec_x >> 16;
+    vec7 = vec_x & const_0xFFFF;
+    vec_x += vec1;
+    vec4 = vec_x >> 16;
+    vec8 = vec_x & const_0xFFFF;
+    vec_x += vec1;
+    vec5 = vec_x >> 16;
+    vec9 = vec_x & const_0xFFFF;
+    vec_x += vec1;
+    vec6 >>= 9;
+    vec7 >>= 9;
+    vec8 >>= 9;
+    vec9 >>= 9;
+    LOAD_INDEXED_DATA(src_ptr, vec2, tmp0);
+    LOAD_INDEXED_DATA(src_ptr, vec3, tmp1);
+    LOAD_INDEXED_DATA(src_ptr, vec4, tmp2);
+    LOAD_INDEXED_DATA(src_ptr, vec5, tmp3);
+    vec2 += 1;
+    vec3 += 1;
+    vec4 += 1;
+    vec5 += 1;
+    LOAD_INDEXED_DATA(src_ptr, vec2, tmp4);
+    LOAD_INDEXED_DATA(src_ptr, vec3, tmp5);
+    LOAD_INDEXED_DATA(src_ptr, vec4, tmp6);
+    LOAD_INDEXED_DATA(src_ptr, vec5, tmp7);
+    tmp4 -= tmp0;
+    tmp5 -= tmp1;
+    tmp6 -= tmp2;
+    tmp7 -= tmp3;
+    tmp4 *= vec6;
+    tmp5 *= vec7;
+    tmp6 *= vec8;
+    tmp7 *= vec9;
+    tmp4 += const_0x40;
+    tmp5 += const_0x40;
+    tmp6 += const_0x40;
+    tmp7 += const_0x40;
+    tmp4 >>= 7;
+    tmp5 >>= 7;
+    tmp6 >>= 7;
+    tmp7 >>= 7;
+    tmp0 += tmp4;
+    tmp1 += tmp5;
+    tmp2 += tmp6;
+    tmp3 += tmp7;
+    reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0);
+    reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    __msa_st_b(dst0, dst_ptr, 0);
+    dst_ptr += 16;
+  }
+}
+
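+// Nearest-neighbor ARGB columns: gathers the pixel at x >> 16, stepping dx.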
+void ScaleARGBCols_MSA(uint8_t* dst_argb,
+                       const uint8_t* src_argb,
+                       int dst_width,
+                       int x,
+                       int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  uint32_t* dst = (uint32_t*)(dst_argb);
+  int j;
+  v4i32 x_vec = __msa_fill_w(x);
+  v4i32 dx_vec = __msa_fill_w(dx);
+  v4i32 const_vec = {0, 1, 2, 3};
+  v4i32 vec0, vec1, vec2;
+  v4i32 dst0;
+
+  vec0 = dx_vec * const_vec;
+  vec1 = dx_vec * 4;
+  x_vec += vec0;
+
+  for (j = 0; j < dst_width; j += 4) {
+    vec2 = x_vec >> 16;
+    x_vec += vec1;
+    LOAD_INDEXED_DATA(src, vec2, dst0);
+    __msa_st_w(dst0, dst, 0);
+    dst += 4;
+  }
+}
+
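+// Bilinear ARGB columns: blends the pixels at x >> 16 and x >> 16 + 1 with
+// a 7-bit weight taken from the fraction (dotp_u_h does the two multiplies).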
+void ScaleARGBFilterCols_MSA(uint8_t* dst_argb,
+                             const uint8_t* src_argb,
+                             int dst_width,
+                             int x,
+                             int dx) {
+  const uint32_t* src = (const uint32_t*)(src_argb);
+  int j;
+  v4u32 src0, src1, src2, src3;
+  v4u32 vec0, vec1, vec2, vec3;
+  v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  v16u8 mult0, mult1, mult2, mult3;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 dst0, dst1;
+  v4u32 vec_x = (v4u32)__msa_fill_w(x);
+  v4u32 vec_dx = (v4u32)__msa_fill_w(dx);
+  v4u32 vec_const = {0, 1, 2, 3};
+  v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f);
+
+  vec0 = vec_dx * vec_const;
+  vec1 = vec_dx * 4;
+  vec_x += vec0;
+
+  for (j = 0; j < dst_width - 1; j += 8) {
+    vec2 = vec_x >> 16;
+    reg0 = (v16u8)(vec_x >> 9);
+    vec_x += vec1;
+    vec3 = vec_x >> 16;
+    reg1 = (v16u8)(vec_x >> 9);
+    vec_x += vec1;
+    reg0 = reg0 & const_0x7f;
+    reg1 = reg1 & const_0x7f;
+    reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0);
+    reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0);
+    reg2 = reg0 ^ const_0x7f;
+    reg3 = reg1 ^ const_0x7f;
+    mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2);
+    mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2);
+    mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3);
+    mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3);
+    LOAD_INDEXED_DATA(src, vec2, src0);
+    LOAD_INDEXED_DATA(src, vec3, src1);
+    vec2 += 1;
+    vec3 += 1;
+    LOAD_INDEXED_DATA(src, vec2, src2);
+    LOAD_INDEXED_DATA(src, vec3, src3);
+    reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0);
+    reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0);
+    reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1);
+    reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1);
+    tmp0 = __msa_dotp_u_h(reg4, mult0);
+    tmp1 = __msa_dotp_u_h(reg5, mult1);
+    tmp2 = __msa_dotp_u_h(reg6, mult2);
+    tmp3 = __msa_dotp_u_h(reg7, mult3);
+    tmp0 >>= 7;
+    tmp1 >>= 7;
+    tmp2 >>= 7;
+    tmp3 >>= 7;
+    dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+    __msa_st_b(dst0, dst_argb, 0);
+    __msa_st_b(dst1, dst_argb, 16);
+    dst_argb += 32;
+  }
+}
+
+void ScaleRowDown34_MSA(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  int x;
+  (void)src_stride;
+  v16u8 src0, src1, src2, src3;
+  v16u8 vec0, vec1, vec2;
+  v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20};
+  v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25};
+  v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20,
+                 21, 23, 24, 25, 27, 28, 29, 31};
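+  // The masks drop every 4th source byte (offsets 2, 6, 10, ...), point
+  // sampling 64 input pixels down to 48 outputs per iteration.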
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+
+  for (x = 0; x < dst_width; x += 48) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48);
+    vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0);
+    vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1);
+    vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2);
+    __msa_st_b((v16i8)vec0, dst, 0);
+    __msa_st_b((v16i8)vec1, dst, 16);
+    __msa_st_b((v16i8)vec2, dst, 32);
+    src_ptr += 64;
+    dst += 48;
+  }
+}
+
+void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
+  v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
+  v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
+  v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
+  v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+  v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
+                 16, 17, 17, 18, 18, 19, 20, 21};
+  v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
+  v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
+  v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
+  v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
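+
+  // Arithmetic sketch: per row, each group of 4 source pixels becomes 3 via
+  // (3a+b+2)>>2, (b+c+1)>>1, (c+3d+2)>>2 -- the dot products with const0..2
+  // and the rounding shifts in the loop below. The two rows then mix 3:1,
+  // i.e. dst = (3 * row0 + row1 + 2) >> 2.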
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+
+  for (x = 0; x < dst_width; x += 48) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+    vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
+    vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
+    vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
+    vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+    vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
+    vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
+    vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
+    vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
+    vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
+    vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
+    vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
+    reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
+    reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
+    reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
+    reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
+    reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
+    reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
+    reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
+    reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
+    reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
+    reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
+    reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
+    reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
+    reg0 = __msa_srar_h(reg0, shft0);
+    reg1 = __msa_srar_h(reg1, shft1);
+    reg2 = __msa_srar_h(reg2, shft2);
+    reg3 = __msa_srar_h(reg3, shft0);
+    reg4 = __msa_srar_h(reg4, shft1);
+    reg5 = __msa_srar_h(reg5, shft2);
+    reg6 = __msa_srar_h(reg6, shft0);
+    reg7 = __msa_srar_h(reg7, shft1);
+    reg8 = __msa_srar_h(reg8, shft2);
+    reg9 = __msa_srar_h(reg9, shft0);
+    reg10 = __msa_srar_h(reg10, shft1);
+    reg11 = __msa_srar_h(reg11, shft2);
+    reg0 = reg0 * 3 + reg6;
+    reg1 = reg1 * 3 + reg7;
+    reg2 = reg2 * 3 + reg8;
+    reg3 = reg3 * 3 + reg9;
+    reg4 = reg4 * 3 + reg10;
+    reg5 = reg5 * 3 + reg11;
+    reg0 = __msa_srari_h(reg0, 2);
+    reg1 = __msa_srari_h(reg1, 2);
+    reg2 = __msa_srari_h(reg2, 2);
+    reg3 = __msa_srari_h(reg3, 2);
+    reg4 = __msa_srari_h(reg4, 2);
+    reg5 = __msa_srari_h(reg5, 2);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+    dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+    __msa_st_b((v16i8)dst0, d, 0);
+    __msa_st_b((v16i8)dst1, d, 16);
+    __msa_st_b((v16i8)dst2, d, 32);
+    s += 64;
+    t += 64;
+    d += 48;
+  }
+}
+
+void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* d,
+                              int dst_width) {
+  const uint8_t* s = src_ptr;
+  const uint8_t* t = src_ptr + src_stride;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5;
+  v16u8 vec6, vec7, vec8, vec9, vec10, vec11;
+  v8i16 reg0, reg1, reg2, reg3, reg4, reg5;
+  v8i16 reg6, reg7, reg8, reg9, reg10, reg11;
+  v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1};
+  v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1};
+  v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3};
+  v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
+  v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15,
+                 16, 17, 17, 18, 18, 19, 20, 21};
+  v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15};
+  v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1};
+  v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2};
+  v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2};
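+
+  // Same per-row (3,1)/(1,1)/(1,3) filter as the _0_Box variant above, but
+  // the two rows are weighted equally: dst = (row0 + row1 + 1) >> 1.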
+
+  assert((dst_width % 3 == 0) && (dst_width > 0));
+
+  for (x = 0; x < dst_width; x += 48) {
+    src0 = (v16u8)__msa_ld_b((v16i8*)s, 0);
+    src1 = (v16u8)__msa_ld_b((v16i8*)s, 16);
+    src2 = (v16u8)__msa_ld_b((v16i8*)s, 32);
+    src3 = (v16u8)__msa_ld_b((v16i8*)s, 48);
+    src4 = (v16u8)__msa_ld_b((v16i8*)t, 0);
+    src5 = (v16u8)__msa_ld_b((v16i8*)t, 16);
+    src6 = (v16u8)__msa_ld_b((v16i8*)t, 32);
+    src7 = (v16u8)__msa_ld_b((v16i8*)t, 48);
+    vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0);
+    vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0);
+    vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1);
+    vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2);
+    vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2);
+    vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3);
+    vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4);
+    vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4);
+    vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5);
+    vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6);
+    vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6);
+    vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7);
+    reg0 = (v8i16)__msa_dotp_u_h(vec0, const0);
+    reg1 = (v8i16)__msa_dotp_u_h(vec1, const1);
+    reg2 = (v8i16)__msa_dotp_u_h(vec2, const2);
+    reg3 = (v8i16)__msa_dotp_u_h(vec3, const0);
+    reg4 = (v8i16)__msa_dotp_u_h(vec4, const1);
+    reg5 = (v8i16)__msa_dotp_u_h(vec5, const2);
+    reg6 = (v8i16)__msa_dotp_u_h(vec6, const0);
+    reg7 = (v8i16)__msa_dotp_u_h(vec7, const1);
+    reg8 = (v8i16)__msa_dotp_u_h(vec8, const2);
+    reg9 = (v8i16)__msa_dotp_u_h(vec9, const0);
+    reg10 = (v8i16)__msa_dotp_u_h(vec10, const1);
+    reg11 = (v8i16)__msa_dotp_u_h(vec11, const2);
+    reg0 = __msa_srar_h(reg0, shft0);
+    reg1 = __msa_srar_h(reg1, shft1);
+    reg2 = __msa_srar_h(reg2, shft2);
+    reg3 = __msa_srar_h(reg3, shft0);
+    reg4 = __msa_srar_h(reg4, shft1);
+    reg5 = __msa_srar_h(reg5, shft2);
+    reg6 = __msa_srar_h(reg6, shft0);
+    reg7 = __msa_srar_h(reg7, shft1);
+    reg8 = __msa_srar_h(reg8, shft2);
+    reg9 = __msa_srar_h(reg9, shft0);
+    reg10 = __msa_srar_h(reg10, shft1);
+    reg11 = __msa_srar_h(reg11, shft2);
+    reg0 += reg6;
+    reg1 += reg7;
+    reg2 += reg8;
+    reg3 += reg9;
+    reg4 += reg10;
+    reg5 += reg11;
+    reg0 = __msa_srari_h(reg0, 1);
+    reg1 = __msa_srari_h(reg1, 1);
+    reg2 = __msa_srari_h(reg2, 1);
+    reg3 = __msa_srari_h(reg3, 1);
+    reg4 = __msa_srari_h(reg4, 1);
+    reg5 = __msa_srari_h(reg5, 1);
+    dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0);
+    dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2);
+    dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4);
+    __msa_st_b((v16i8)dst0, d, 0);
+    __msa_st_b((v16i8)dst1, d, 16);
+    __msa_st_b((v16i8)dst2, d, 32);
+    s += 64;
+    t += 64;
+    d += 48;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
+#endif  // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
--- a/third_party/libyuv/source/scale_neon.cc
+++ b/third_party/libyuv/source/scale_neon.cc
@@ -23,564 +23,541 @@
 // Provided by Fritz Koenig
 
 // Read 32x1, throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS(0)
-    "vld2.8     {q0, q1}, [%0]!                \n"
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1"              // Clobber List
-  );
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load even pixels into q0, odd into q1
+      "vld2.8     {q0, q1}, [%0]!                \n"
+      "subs       %2, %2, #16                    \n"  // 16 processed per loop
+      "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "q0", "q1"  // Clobber List
+      );
 }
 
 // Read 32x1 average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load pixels and post inc
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // add adjacent
-    "vpaddl.u8  q1, q1                         \n"
-    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #1                     \n"
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1"     // Clobber List
-  );
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld2.8     {q0, q1}, [%0]!                \n"  // load 32 pixels
+      "subs       %2, %2, #16                    \n"  // 16 processed per loop
+      "vrhadd.u8  q0, q0, q1                     \n"  // rounding half add
+      "vst1.8     {q0}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "q0", "q1"  // Clobber List
+      );
 }
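+
+// Note: vld2.8 de-interleaves even pixels into q0 and odd pixels into q1;
+// vrhadd.u8 then yields the rounded average (a + b + 1) >> 1 in one step.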
 
 // Read 32x2 average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %0                         \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
-    MEMACCESS(1)
-    "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
-    "subs       %3, %3, #16                    \n"  // 16 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
-    "vpaddl.u8  q1, q1                         \n"
-    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent + row1
-    "vpadal.u8  q1, q3                         \n"
-    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #2                     \n"
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "q0", "q1", "q2", "q3"     // Clobber List
-  );
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add        %1, %0                         \n"
+      "1:                                        \n"
+      "vld1.8     {q0, q1}, [%0]!                \n"  // load row 1 and post inc
+      "vld1.8     {q2, q3}, [%1]!                \n"  // load row 2 and post inc
+      "subs       %3, %3, #16                    \n"  // 16 processed per loop
+      "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
+      "vpaddl.u8  q1, q1                         \n"
+      "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent +
+                                                      // row1
+      "vpadal.u8  q1, q3                         \n"
+      "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and
+                                                      // pack
+      "vrshrn.u16 d1, q1, #2                     \n"
+      "vst1.8     {q0}, [%2]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n" // src line 0
-    "subs       %2, %2, #8                     \n" // 8 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {d2}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "q0", "q1", "memory", "cc"
-  );
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop
+      "vst1.8     {d2}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "q0", "q1", "memory", "cc");
 }
 
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  const uint8* src_ptr2 = src_ptr + src_stride * 2;
-  const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0]!                    \n"   // load up 16x4
-    MEMACCESS(3)
-    "vld1.8     {q1}, [%3]!                    \n"
-    MEMACCESS(4)
-    "vld1.8     {q2}, [%4]!                    \n"
-    MEMACCESS(5)
-    "vld1.8     {q3}, [%5]!                    \n"
-    "subs       %2, %2, #4                     \n"
-    "vpaddl.u8  q0, q0                         \n"
-    "vpadal.u8  q0, q1                         \n"
-    "vpadal.u8  q0, q2                         \n"
-    "vpadal.u8  q0, q3                         \n"
-    "vpaddl.u16 q0, q0                         \n"
-    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
-    "vmovn.u16  d0, q0                         \n"
-    MEMACCESS(1)
-    "vst1.32    {d0[0]}, [%1]!                 \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(dst_width), // %2
-    "+r"(src_ptr1),  // %3
-    "+r"(src_ptr2),  // %4
-    "+r"(src_ptr3)   // %5
-  :
-  : "q0", "q1", "q2", "q3", "memory", "cc"
-  );
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {q0}, [%0]!                    \n"  // load up 16x4
+      "vld1.8     {q1}, [%3]!                    \n"
+      "vld1.8     {q2}, [%4]!                    \n"
+      "vld1.8     {q3}, [%5]!                    \n"
+      "subs       %2, %2, #4                     \n"
+      "vpaddl.u8  q0, q0                         \n"
+      "vpadal.u8  q0, q1                         \n"
+      "vpadal.u8  q0, q2                         \n"
+      "vpadal.u8  q0, q3                         \n"
+      "vpaddl.u16 q0, q0                         \n"
+      "vrshrn.u32 d0, q0, #4                     \n"  // divide by 16 w/rounding
+      "vmovn.u16  d0, q0                         \n"
+      "vst1.32    {d0[0]}, [%1]!                 \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_ptr1),   // %3
+        "+r"(src_ptr2),   // %4
+        "+r"(src_ptr3)    // %5
+      :
+      : "q0", "q1", "q2", "q3", "memory", "cc");
 }
 
 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
 // to load every 4th pixel into 4 different registers.
 // Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    "subs       %2, %2, #24                  \n"
-    "vmov       d2, d3                       \n" // order d0, d1, d2
-    MEMACCESS(1)
-    "vst3.8     {d0, d1, d2}, [%1]!          \n"
-    "bgt        1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "d0", "d1", "d2", "d3", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // src line 0
+      "subs       %2, %2, #24                    \n"
+      "vmov       d2, d3                         \n"  // order d0, d1, d2
+      "vst3.8     {d0, d1, d2}, [%1]!            \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "d0", "d1", "d2", "d3", "memory", "cc");
 }
 
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vmov.u8    d24, #3                        \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
-    "subs         %2, %2, #24                  \n"
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "vmov.u8    d24, #3                        \n"
+      "add        %3, %0                         \n"
+      "1:                                        \n"
+      "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"  // src line 0
+      "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"  // src line 1
+      "subs         %2, %2, #24                  \n"
 
-    // filter src line 0 with src line 1
-    // expand chars to shorts to allow for room
-    // when adding lines together
-    "vmovl.u8     q8, d4                       \n"
-    "vmovl.u8     q9, d5                       \n"
-    "vmovl.u8     q10, d6                      \n"
-    "vmovl.u8     q11, d7                      \n"
+      // filter src line 0 with src line 1
+      // expand chars to shorts to allow for room
+      // when adding lines together
+      "vmovl.u8     q8, d4                       \n"
+      "vmovl.u8     q9, d5                       \n"
+      "vmovl.u8     q10, d6                      \n"
+      "vmovl.u8     q11, d7                      \n"
 
-    // 3 * line_0 + line_1
-    "vmlal.u8     q8, d0, d24                  \n"
-    "vmlal.u8     q9, d1, d24                  \n"
-    "vmlal.u8     q10, d2, d24                 \n"
-    "vmlal.u8     q11, d3, d24                 \n"
+      // 3 * line_0 + line_1
+      "vmlal.u8     q8, d0, d24                  \n"
+      "vmlal.u8     q9, d1, d24                  \n"
+      "vmlal.u8     q10, d2, d24                 \n"
+      "vmlal.u8     q11, d3, d24                 \n"
 
-    // (3 * line_0 + line_1) >> 2
-    "vqrshrn.u16  d0, q8, #2                   \n"
-    "vqrshrn.u16  d1, q9, #2                   \n"
-    "vqrshrn.u16  d2, q10, #2                  \n"
-    "vqrshrn.u16  d3, q11, #2                  \n"
+      // (3 * line_0 + line_1) >> 2
+      "vqrshrn.u16  d0, q8, #2                   \n"
+      "vqrshrn.u16  d1, q9, #2                   \n"
+      "vqrshrn.u16  d2, q10, #2                  \n"
+      "vqrshrn.u16  d3, q11, #2                  \n"
 
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "vmovl.u8     q8, d1                       \n"
-    "vmlal.u8     q8, d0, d24                  \n"
-    "vqrshrn.u16  d0, q8, #2                   \n"
+      // a0 = (src[0] * 3 + s[1] * 1) >> 2
+      "vmovl.u8     q8, d1                       \n"
+      "vmlal.u8     q8, d0, d24                  \n"
+      "vqrshrn.u16  d0, q8, #2                   \n"
 
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "vrhadd.u8    d1, d1, d2                   \n"
+      // a1 = (src[1] * 1 + s[2] * 1) >> 1
+      "vrhadd.u8    d1, d1, d2                   \n"
 
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "vmovl.u8     q8, d2                       \n"
-    "vmlal.u8     q8, d3, d24                  \n"
-    "vqrshrn.u16  d2, q8, #2                   \n"
+      // a2 = (src[2] * 1 + s[3] * 3) >> 2
+      "vmovl.u8     q8, d2                       \n"
+      "vmlal.u8     q8, d3, d24                  \n"
+      "vqrshrn.u16  d2, q8, #2                   \n"
 
-    MEMACCESS(1)
-    "vst3.8       {d0, d1, d2}, [%1]!          \n"
+      "vst3.8       {d0, d1, d2}, [%1]!          \n"
 
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
-  );
+      "bgt          1b                           \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory",
+        "cc");
 }
 
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "vmov.u8    d24, #3                        \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n" // src line 0
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n" // src line 1
-    "subs         %2, %2, #24                  \n"
-    // average src line 0 with src line 1
-    "vrhadd.u8    q0, q0, q2                   \n"
-    "vrhadd.u8    q1, q1, q3                   \n"
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "vmov.u8    d24, #3                        \n"
+      "add        %3, %0                         \n"
+      "1:                                        \n"
+      "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"  // src line 0
+      "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"  // src line 1
+      "subs         %2, %2, #24                  \n"
+      // average src line 0 with src line 1
+      "vrhadd.u8    q0, q0, q2                   \n"
+      "vrhadd.u8    q1, q1, q3                   \n"
 
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "vmovl.u8     q3, d1                       \n"
-    "vmlal.u8     q3, d0, d24                  \n"
-    "vqrshrn.u16  d0, q3, #2                   \n"
+      // a0 = (src[0] * 3 + s[1] * 1) >> 2
+      "vmovl.u8     q3, d1                       \n"
+      "vmlal.u8     q3, d0, d24                  \n"
+      "vqrshrn.u16  d0, q3, #2                   \n"
 
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "vrhadd.u8    d1, d1, d2                   \n"
+      // a1 = (src[1] * 1 + s[2] * 1) >> 1
+      "vrhadd.u8    d1, d1, d2                   \n"
 
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "vmovl.u8     q3, d2                       \n"
-    "vmlal.u8     q3, d3, d24                  \n"
-    "vqrshrn.u16  d2, q3, #2                   \n"
+      // a2 = (src[2] * 1 + s[3] * 3) >> 2
+      "vmovl.u8     q3, d2                       \n"
+      "vmlal.u8     q3, d3, d24                  \n"
+      "vqrshrn.u16  d2, q3, #2                   \n"
 
-    MEMACCESS(1)
-    "vst3.8       {d0, d1, d2}, [%1]!          \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
-  );
+      "vst3.8       {d0, d1, d2}, [%1]!          \n"
+      "bgt          1b                           \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc");
 }
 
 #define HAS_SCALEROWDOWN38_NEON
-static uvec8 kShuf38 =
-  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
-  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
-  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
-    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
-  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
-    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
+                              22, 24, 27, 30, 0,  0,  0,  0};
+static const uvec8 kShuf38_2 = {0,  8, 16, 2,  10, 17, 4, 12,
+                                18, 6, 14, 19, 0,  0,  0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18};
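+
+// vqrdmulh.s16 computes roughly (2 * a * b + 0x8000) >> 16; the built-in
+// doubling is why dividing by 6 and 9 uses constants of 65536 / 12 and
+// 65536 / 18 rather than 65536 / 6 and 65536 / 9.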
 
 // 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(3)
-    "vld1.8     {q3}, [%3]                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
-    "subs       %2, %2, #12                    \n"
-    "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
-    "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
-    MEMACCESS(1)
-    "vst1.8     {d4}, [%1]!                    \n"
-    MEMACCESS(1)
-    "vst1.32    {d5[0]}, [%1]!                 \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  : "r"(&kShuf38)           // %3
-  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "vld1.8     {q3}, [%3]                     \n"
+      "1:                                        \n"
+      "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"
+      "subs       %2, %2, #12                    \n"
+      "vtbl.u8    d4, {d0, d1, d2, d3}, d6       \n"
+      "vtbl.u8    d5, {d0, d1, d2, d3}, d7       \n"
+      "vst1.8     {d4}, [%1]!                    \n"
+      "vst1.32    {d5[0]}, [%1]!                 \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      : "r"(&kShuf38)    // %3
+      : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc");
 }
 
 // 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+                                      uint8_t* dst_ptr,
+                                      int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
 
-  asm volatile (
-    MEMACCESS(5)
-    "vld1.16    {q13}, [%5]                    \n"
-    MEMACCESS(6)
-    "vld1.8     {q14}, [%6]                    \n"
-    MEMACCESS(7)
-    "vld1.8     {q15}, [%7]                    \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
+  asm volatile(
+      "vld1.16    {q13}, [%5]                    \n"
+      "vld1.8     {q14}, [%6]                    \n"
+      "vld1.8     {q15}, [%7]                    \n"
+      "add        %3, %0                         \n"
+      "1:                                        \n"
 
-    // d0 = 00 40 01 41 02 42 03 43
-    // d1 = 10 50 11 51 12 52 13 53
-    // d2 = 20 60 21 61 22 62 23 63
-    // d3 = 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
-    MEMACCESS(4)
-    "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
-    "subs         %2, %2, #12                  \n"
+      // d0 = 00 40 01 41 02 42 03 43
+      // d1 = 10 50 11 51 12 52 13 53
+      // d2 = 20 60 21 61 22 62 23 63
+      // d3 = 30 70 31 71 32 72 33 73
+      "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+      "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+      "vld4.8       {d16, d17, d18, d19}, [%4]!  \n"
+      "subs         %2, %2, #12                  \n"
 
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // d0 = 00 10 01 11 02 12 03 13
-    // d1 = 40 50 41 51 42 52 43 53
-    "vtrn.u8      d0, d1                       \n"
-    "vtrn.u8      d4, d5                       \n"
-    "vtrn.u8      d16, d17                     \n"
+      // Shuffle the input data around to align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // d0 = 00 10 01 11 02 12 03 13
+      // d1 = 40 50 41 51 42 52 43 53
+      "vtrn.u8      d0, d1                       \n"
+      "vtrn.u8      d4, d5                       \n"
+      "vtrn.u8      d16, d17                     \n"
 
-    // d2 = 20 30 21 31 22 32 23 33
-    // d3 = 60 70 61 71 62 72 63 73
-    "vtrn.u8      d2, d3                       \n"
-    "vtrn.u8      d6, d7                       \n"
-    "vtrn.u8      d18, d19                     \n"
+      // d2 = 20 30 21 31 22 32 23 33
+      // d3 = 60 70 61 71 62 72 63 73
+      "vtrn.u8      d2, d3                       \n"
+      "vtrn.u8      d6, d7                       \n"
+      "vtrn.u8      d18, d19                     \n"
+
+      // d0 = 00+10 01+11 02+12 03+13
+      // d2 = 40+50 41+51 42+52 43+53
+      "vpaddl.u8    q0, q0                       \n"
+      "vpaddl.u8    q2, q2                       \n"
+      "vpaddl.u8    q8, q8                       \n"
 
-    // d0 = 00+10 01+11 02+12 03+13
-    // d2 = 40+50 41+51 42+52 43+53
-    "vpaddl.u8    q0, q0                       \n"
-    "vpaddl.u8    q2, q2                       \n"
-    "vpaddl.u8    q8, q8                       \n"
+      // d3 = 60+70 61+71 62+72 63+73
+      "vpaddl.u8    d3, d3                       \n"
+      "vpaddl.u8    d7, d7                       \n"
+      "vpaddl.u8    d19, d19                     \n"
 
-    // d3 = 60+70 61+71 62+72 63+73
-    "vpaddl.u8    d3, d3                       \n"
-    "vpaddl.u8    d7, d7                       \n"
-    "vpaddl.u8    d19, d19                     \n"
+      // combine source lines
+      "vadd.u16     q0, q2                       \n"
+      "vadd.u16     q0, q8                       \n"
+      "vadd.u16     d4, d3, d7                   \n"
+      "vadd.u16     d4, d19                      \n"
 
-    // combine source lines
-    "vadd.u16     q0, q2                       \n"
-    "vadd.u16     q0, q8                       \n"
-    "vadd.u16     d4, d3, d7                   \n"
-    "vadd.u16     d4, d19                      \n"
+      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+      //             + s[6 + st * 1] + s[7 + st * 1]
+      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+      "vqrdmulh.s16 q2, q2, q13                  \n"
+      "vmovn.u16    d4, q2                       \n"
 
-    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
-    //             + s[6 + st * 1] + s[7 + st * 1]
-    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
-    "vqrdmulh.s16 q2, q2, q13                  \n"
-    "vmovn.u16    d4, q2                       \n"
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      "vmovl.u8     q1, d2                       \n"
+      "vmovl.u8     q3, d6                       \n"
+      "vmovl.u8     q9, d18                      \n"
 
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "vmovl.u8     q1, d2                       \n"
-    "vmovl.u8     q3, d6                       \n"
-    "vmovl.u8     q9, d18                      \n"
+      // combine source lines
+      "vadd.u16     q1, q3                       \n"
+      "vadd.u16     q1, q9                       \n"
 
-    // combine source lines
-    "vadd.u16     q1, q3                       \n"
-    "vadd.u16     q1, q9                       \n"
+      // d4 = xx 20 xx 30 xx 22 xx 32
+      // d5 = xx 21 xx 31 xx 23 xx 33
+      "vtrn.u32     d2, d3                       \n"
 
-    // d4 = xx 20 xx 30 xx 22 xx 32
-    // d5 = xx 21 xx 31 xx 23 xx 33
-    "vtrn.u32     d2, d3                       \n"
+      // d4 = xx 20 xx 21 xx 22 xx 23
+      // d5 = xx 30 xx 31 xx 32 xx 33
+      "vtrn.u16     d2, d3                       \n"
 
-    // d4 = xx 20 xx 21 xx 22 xx 23
-    // d5 = xx 30 xx 31 xx 32 xx 33
-    "vtrn.u16     d2, d3                       \n"
+      // 0+1+2, 3+4+5
+      "vadd.u16     q0, q1                       \n"
 
-    // 0+1+2, 3+4+5
-    "vadd.u16     q0, q1                       \n"
+      // Need to divide, but can't downshift as the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "vqrdmulh.s16 q0, q0, q15                  \n"
 
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "vqrdmulh.s16 q0, q0, q15                  \n"
+      // Align for table lookup, vtbl requires registers to
+      //  be adjacent
+      "vmov.u8      d2, d4                       \n"
 
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "vmov.u8      d2, d4                       \n"
+      "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+      "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
 
-    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
-    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
-
-    MEMACCESS(1)
-    "vst1.8       {d3}, [%1]!                  \n"
-    MEMACCESS(1)
-    "vst1.32      {d4[0]}, [%1]!               \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride),       // %3
-    "+r"(src_ptr1)          // %4
-  : "r"(&kMult38_Div6),     // %5
-    "r"(&kShuf38_2),        // %6
-    "r"(&kMult38_Div9)      // %7
-  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
-  );
+      "vst1.8       {d3}, [%1]!                  \n"
+      "vst1.32      {d4[0]}, [%1]!               \n"
+      "bgt          1b                           \n"
+      : "+r"(src_ptr),       // %0
+        "+r"(dst_ptr),       // %1
+        "+r"(dst_width),     // %2
+        "+r"(src_stride),    // %3
+        "+r"(src_ptr1)       // %4
+      : "r"(&kMult38_Div6),  // %5
+        "r"(&kShuf38_2),     // %6
+        "r"(&kMult38_Div9)   // %7
+      : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory",
+        "cc");
 }
 
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(4)
-    "vld1.16    {q13}, [%4]                    \n"
-    MEMACCESS(5)
-    "vld1.8     {q14}, [%5]                    \n"
-    "add        %3, %0                         \n"
-  "1:                                          \n"
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "vld1.16    {q13}, [%4]                    \n"
+      "vld1.8     {q14}, [%5]                    \n"
+      "add        %3, %0                         \n"
+      "1:                                        \n"
 
-    // d0 = 00 40 01 41 02 42 03 43
-    // d1 = 10 50 11 51 12 52 13 53
-    // d2 = 20 60 21 61 22 62 23 63
-    // d3 = 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
-    MEMACCESS(3)
-    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
-    "subs         %2, %2, #12                  \n"
+      // d0 = 00 40 01 41 02 42 03 43
+      // d1 = 10 50 11 51 12 52 13 53
+      // d2 = 20 60 21 61 22 62 23 63
+      // d3 = 30 70 31 71 32 72 33 73
+      "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
+      "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
+      "subs         %2, %2, #12                  \n"
 
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // d0 = 00 10 01 11 02 12 03 13
-    // d1 = 40 50 41 51 42 52 43 53
-    "vtrn.u8      d0, d1                       \n"
-    "vtrn.u8      d4, d5                       \n"
+      // Shuffle the input data around to align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // d0 = 00 10 01 11 02 12 03 13
+      // d1 = 40 50 41 51 42 52 43 53
+      "vtrn.u8      d0, d1                       \n"
+      "vtrn.u8      d4, d5                       \n"
 
-    // d2 = 20 30 21 31 22 32 23 33
-    // d3 = 60 70 61 71 62 72 63 73
-    "vtrn.u8      d2, d3                       \n"
-    "vtrn.u8      d6, d7                       \n"
+      // d2 = 20 30 21 31 22 32 23 33
+      // d3 = 60 70 61 71 62 72 63 73
+      "vtrn.u8      d2, d3                       \n"
+      "vtrn.u8      d6, d7                       \n"
 
-    // d0 = 00+10 01+11 02+12 03+13
-    // d2 = 40+50 41+51 42+52 43+53
-    "vpaddl.u8    q0, q0                       \n"
-    "vpaddl.u8    q2, q2                       \n"
+      // d0 = 00+10 01+11 02+12 03+13
+      // d2 = 40+50 41+51 42+52 43+53
+      "vpaddl.u8    q0, q0                       \n"
+      "vpaddl.u8    q2, q2                       \n"
 
-    // d3 = 60+70 61+71 62+72 63+73
-    "vpaddl.u8    d3, d3                       \n"
-    "vpaddl.u8    d7, d7                       \n"
+      // d3 = 60+70 61+71 62+72 63+73
+      "vpaddl.u8    d3, d3                       \n"
+      "vpaddl.u8    d7, d7                       \n"
 
-    // combine source lines
-    "vadd.u16     q0, q2                       \n"
-    "vadd.u16     d4, d3, d7                   \n"
+      // combine source lines
+      "vadd.u16     q0, q2                       \n"
+      "vadd.u16     d4, d3, d7                   \n"
 
-    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
-    "vqrshrn.u16  d4, q2, #2                   \n"
+      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+      "vqrshrn.u16  d4, q2, #2                   \n"
 
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "vmovl.u8     q1, d2                       \n"
-    "vmovl.u8     q3, d6                       \n"
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      "vmovl.u8     q1, d2                       \n"
+      "vmovl.u8     q3, d6                       \n"
 
-    // combine source lines
-    "vadd.u16     q1, q3                       \n"
+      // combine source lines
+      "vadd.u16     q1, q3                       \n"
 
-    // d4 = xx 20 xx 30 xx 22 xx 32
-    // d5 = xx 21 xx 31 xx 23 xx 33
-    "vtrn.u32     d2, d3                       \n"
+      // d4 = xx 20 xx 30 xx 22 xx 32
+      // d5 = xx 21 xx 31 xx 23 xx 33
+      "vtrn.u32     d2, d3                       \n"
 
-    // d4 = xx 20 xx 21 xx 22 xx 23
-    // d5 = xx 30 xx 31 xx 32 xx 33
-    "vtrn.u16     d2, d3                       \n"
+      // d4 = xx 20 xx 21 xx 22 xx 23
+      // d5 = xx 30 xx 31 xx 32 xx 33
+      "vtrn.u16     d2, d3                       \n"
 
-    // 0+1+2, 3+4+5
-    "vadd.u16     q0, q1                       \n"
+      // 0+1+2, 3+4+5
+      "vadd.u16     q0, q1                       \n"
 
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "vqrdmulh.s16 q0, q0, q13                  \n"
+      // Need to divide, but can't downshift as the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "vqrdmulh.s16 q0, q0, q13                  \n"
 
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "vmov.u8      d2, d4                       \n"
+      // Align for table lookup, vtbl requires registers to
+      //  be adjacent
+      "vmov.u8      d2, d4                       \n"
 
-    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
-    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
+      "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
+      "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
 
-    MEMACCESS(1)
-    "vst1.8       {d3}, [%1]!                  \n"
-    MEMACCESS(1)
-    "vst1.32      {d4[0]}, [%1]!               \n"
-    "bgt          1b                           \n"
-  : "+r"(src_ptr),       // %0
-    "+r"(dst_ptr),       // %1
-    "+r"(dst_width),     // %2
-    "+r"(src_stride)     // %3
-  : "r"(&kMult38_Div6),  // %4
-    "r"(&kShuf38_2)      // %5
-  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
-  );
+      "vst1.8       {d3}, [%1]!                  \n"
+      "vst1.32      {d4[0]}, [%1]!               \n"
+      "bgt          1b                           \n"
+      : "+r"(src_ptr),       // %0
+        "+r"(dst_ptr),       // %1
+        "+r"(dst_width),     // %2
+        "+r"(src_stride)     // %3
+      : "r"(&kMult38_Div6),  // %4
+        "r"(&kShuf38_2)      // %5
+      : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
 }
 
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
-  const uint8* src_tmp;
-  asm volatile (
-  "1:                                          \n"
-    "mov       %0, %1                          \n"
-    "mov       r12, %5                         \n"
-    "veor      q2, q2, q2                      \n"
-    "veor      q3, q3, q3                      \n"
-  "2:                                          \n"
-    // load 16 pixels into q0
-    MEMACCESS(0)
-    "vld1.8     {q0}, [%0], %3                 \n"
-    "vaddw.u8   q3, q3, d1                     \n"
-    "vaddw.u8   q2, q2, d0                     \n"
-    "subs       r12, r12, #1                   \n"
-    "bgt        2b                             \n"
-    MEMACCESS(2)
-    "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
-    "add        %1, %1, #16                    \n"
-    "subs       %4, %4, #16                    \n"  // 16 processed per loop
-    "bgt        1b                             \n"
-  : "=&r"(src_tmp),    // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_ptr),     // %2
-    "+r"(src_stride),  // %3
-    "+r"(src_width),   // %4
-    "+r"(src_height)   // %5
-  :
-  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
-  );
+void ScaleAddRows_NEON(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint16_t* dst_ptr,
+                       int src_width,
+                       int src_height) {
+  const uint8_t* src_tmp;
+  asm volatile(
+      "1:                                        \n"
+      "mov       %0, %1                          \n"
+      "mov       r12, %5                         \n"
+      "veor      q2, q2, q2                      \n"
+      "veor      q3, q3, q3                      \n"
+      "2:                                        \n"
+      // load 16 pixels into q0
+      "vld1.8     {q0}, [%0], %3                 \n"
+      "vaddw.u8   q3, q3, d1                     \n"
+      "vaddw.u8   q2, q2, d0                     \n"
+      "subs       r12, r12, #1                   \n"
+      "bgt        2b                             \n"
+      "vst1.16    {q2, q3}, [%2]!                \n"  // store pixels
+      "add        %1, %1, #16                    \n"
+      "subs       %4, %4, #16                    \n"  // 16 processed per loop
+      "bgt        1b                             \n"
+      : "=&r"(src_tmp),    // %0
+        "+r"(src_ptr),     // %1
+        "+r"(dst_ptr),     // %2
+        "+r"(src_stride),  // %3
+        "+r"(src_width),   // %4
+        "+r"(src_height)   // %5
+      :
+      : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
 // TODO(Yang Zhang): Investigate fewer load instructions for
 // the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                                    \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5                     \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "vld2.8     {d6["#n"], d7["#n"]}, [%6]     \n"
+#define LOAD2_DATA8_LANE(n)                      \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5                     \n" \
+  "add        %3, %3, %4                     \n" \
+  "vld2.8     {d6[" #n "], d7[" #n "]}, [%6] \n"
 
-// The NEON version mimics this formula:
-// #define BLENDER(a, b, f) (uint8)((int)(a) +
-//    ((int)(f) * ((int)(b) - (int)(a)) >> 16))
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
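+// Worked example: a = 100, b = 200, f = 0x4000 (1/4) gives
+// 100 + ((0x4000 * 100 + 0x8000) >> 16) = 100 + 25 = 125.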
 
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx) {
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_ptr;
+  const uint8_t* src_tmp = src_ptr;
   asm volatile (
     "vdup.32    q0, %3                         \n"  // x
     "vdup.32    q1, %4                         \n"  // dx
@@ -617,7 +594,6 @@
     "vadd.s16   q8, q8, q9                     \n"
     "vmovn.s16  d6, q8                         \n"
 
-    MEMACCESS(0)
     "vst1.8     {d6}, [%0]!                    \n"  // store pixels
     "vadd.s32   q1, q1, q0                     \n"
     "vadd.s32   q2, q2, q0                     \n"
@@ -639,307 +615,279 @@
 #undef LOAD2_DATA8_LANE
 
 // 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
-                          const uint8* src_ptr, ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction) {
-  asm volatile (
-    "cmp          %4, #0                       \n"
-    "beq          100f                         \n"
-    "add          %2, %1                       \n"
-    "cmp          %4, #64                      \n"
-    "beq          75f                          \n"
-    "cmp          %4, #128                     \n"
-    "beq          50f                          \n"
-    "cmp          %4, #192                     \n"
-    "beq          25f                          \n"
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
+                          int source_y_fraction) {
+  asm volatile(
+      "cmp          %4, #0                       \n"
+      "beq          100f                         \n"
+      "add          %2, %1                       \n"
+      "cmp          %4, #64                      \n"
+      "beq          75f                          \n"
+      "cmp          %4, #128                     \n"
+      "beq          50f                          \n"
+      "cmp          %4, #192                     \n"
+      "beq          25f                          \n"
 
-    "vdup.8       d5, %4                       \n"
-    "rsb          %4, #256                     \n"
-    "vdup.8       d4, %4                       \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vmull.u8     q13, d0, d4                  \n"
-    "vmull.u8     q14, d1, d4                  \n"
-    "vmlal.u8     q13, d2, d5                  \n"
-    "vmlal.u8     q14, d3, d5                  \n"
-    "vrshrn.u16   d0, q13, #8                  \n"
-    "vrshrn.u16   d1, q14, #8                  \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          1b                           \n"
-    "b            99f                          \n"
+      "vdup.8       d5, %4                       \n"
+      "rsb          %4, #256                     \n"
+      "vdup.8       d4, %4                       \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "vld1.8       {q0}, [%1]!                  \n"
+      "vld1.8       {q1}, [%2]!                  \n"
+      "subs         %3, %3, #16                  \n"
+      "vmull.u8     q13, d0, d4                  \n"
+      "vmull.u8     q14, d1, d4                  \n"
+      "vmlal.u8     q13, d2, d5                  \n"
+      "vmlal.u8     q14, d3, d5                  \n"
+      "vrshrn.u16   d0, q13, #8                  \n"
+      "vrshrn.u16   d1, q14, #8                  \n"
+      "vst1.8       {q0}, [%0]!                  \n"
+      "bgt          1b                           \n"
+      "b            99f                          \n"
 
-    // Blend 25 / 75.
-  "25:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          25b                          \n"
-    "b            99f                          \n"
+      // Blend 25 / 75.
+      "25:                                       \n"
+      "vld1.8       {q0}, [%1]!                  \n"
+      "vld1.8       {q1}, [%2]!                  \n"
+      "subs         %3, %3, #16                  \n"
+      "vrhadd.u8    q0, q1                       \n"
+      "vrhadd.u8    q0, q1                       \n"
+      "vst1.8       {q0}, [%0]!                  \n"
+      "bgt          25b                          \n"
+      "b            99f                          \n"
 
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q1}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          50b                          \n"
-    "b            99f                          \n"
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "vld1.8       {q0}, [%1]!                  \n"
+      "vld1.8       {q1}, [%2]!                  \n"
+      "subs         %3, %3, #16                  \n"
+      "vrhadd.u8    q0, q1                       \n"
+      "vst1.8       {q0}, [%0]!                  \n"
+      "bgt          50b                          \n"
+      "b            99f                          \n"
 
-    // Blend 75 / 25.
-  "75:                                         \n"
-    MEMACCESS(1)
-    "vld1.8       {q1}, [%1]!                  \n"
-    MEMACCESS(2)
-    "vld1.8       {q0}, [%2]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    "vrhadd.u8    q0, q1                       \n"
-    "vrhadd.u8    q0, q1                       \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          75b                          \n"
-    "b            99f                          \n"
+      // Blend 75 / 25.
+      "75:                                       \n"
+      "vld1.8       {q1}, [%1]!                  \n"
+      "vld1.8       {q0}, [%2]!                  \n"
+      "subs         %3, %3, #16                  \n"
+      "vrhadd.u8    q0, q1                       \n"
+      "vrhadd.u8    q0, q1                       \n"
+      "vst1.8       {q0}, [%0]!                  \n"
+      "bgt          75b                          \n"
+      "b            99f                          \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "vld1.8       {q0}, [%1]!                  \n"
-    "subs         %3, %3, #16                  \n"
-    MEMACCESS(0)
-    "vst1.8       {q0}, [%0]!                  \n"
-    "bgt          100b                         \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "vld1.8       {q0}, [%1]!                  \n"
+      "subs         %3, %3, #16                  \n"
+      "vst1.8       {q0}, [%0]!                  \n"
+      "bgt          100b                         \n"
 
-  "99:                                         \n"
-    MEMACCESS(0)
-    "vst1.8       {d1[7]}, [%0]                \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(source_y_fraction) // %4
-  :
-  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
-  );
+      "99:                                       \n"
+      "vst1.8       {d1[7]}, [%0]                \n"
+      : "+r"(dst_ptr),           // %0
+        "+r"(src_ptr),           // %1
+        "+r"(src_stride),        // %2
+        "+r"(dst_width),         // %3
+        "+r"(source_y_fraction)  // %4
+      :
+      : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc");
 }
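
A scalar sketch of the general-purpose path above (our reading of the
vmull/vmlal/vrshrn #8 sequence; fractions 0, 64, 128 and 192 take the
dedicated fast paths instead):

    #include <stdint.h>

    // Blend two rows with an 8-bit fraction f; + 128 rounds before the >> 8,
    // matching vrshrn.u16 #8.
    static void filter_rows_c(uint8_t* dst, const uint8_t* row0,
                              const uint8_t* row1, int width, int f) {
      for (int i = 0; i < width; ++i) {
        dst[i] = (uint8_t)((row0[i] * (256 - f) + row1[i] * f + 128) >> 8);
      }
    }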
 
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS(0)
-    "vld2.32    {q0, q1}, [%0]!                \n"
-    MEMACCESS(0)
-    "vld2.32    {q2, q3}, [%0]!                \n"
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    MEMACCESS(1)
-    "vst1.8     {q1}, [%1]!                    \n"  // store odd pixels
-    MEMACCESS(1)
-    "vst1.8     {q3}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
-  );
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.32    {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.32    {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop
+      "vmov       q2, q1                         \n"  // load next 8 ARGB
+      "vst2.32    {q2, q3}, [%1]!                \n"  // store odd pixels
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
 
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
-    "vrshrn.u16 d0, q0, #1                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #1                     \n"
-    "vrshrn.u16 d2, q2, #1                     \n"
-    "vrshrn.u16 d3, q3, #1                     \n"
-    MEMACCESS(1)
-    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"
-    "bgt       1b                              \n"
-  : "+r"(src_argb),         // %0
-    "+r"(dst_argb),         // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3"     // Clobber List
-  );
+// For reference: disassembly of the compiler-generated ScaleARGBRowDown2_C
+// loop that the hand-written version above mirrors:
+//  46:  f964 018d   vld4.32  {d16,d18,d20,d22}, [r4]!
+//  4a:  3e04        subs  r6, #4
+//  4c:  f964 118d   vld4.32  {d17,d19,d21,d23}, [r4]!
+//  50:  ef64 21f4   vorr  q9, q10, q10
+//  54:  f942 038d   vst2.32  {d16-d19}, [r2]!
+//  58:  d1f5        bne.n  46 <ScaleARGBRowDown2_C+0x46>
+
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "vld4.32    {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.32    {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop
+      "vrhadd.u8  q0, q0, q1                     \n"  // rounding half add
+      "vrhadd.u8  q1, q2, q3                     \n"  // rounding half add
+      "vst2.32    {q0, q1}, [%1]!                \n"
+      "bgt       1b                              \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
+      );
 }
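
vrhadd.u8 is a per-byte rounding half add, so each output pixel above is the
channel-wise rounded average of a horizontal pixel pair. In scalar terms:

    #include <stdint.h>

    // Rounding half add, as vrhadd.u8 computes for every byte lane.
    static uint8_t rhadd_u8(uint8_t a, uint8_t b) {
      return (uint8_t)((a + b + 1) >> 1);
    }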
 
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
-    MEMACCESS(0)
-    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB pixels.
-    "subs       %3, %3, #8                     \n"  // 8 processed per loop.
-    "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
-    "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
-    "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
-    "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
-    MEMACCESS(1)
-    "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB pixels.
-    MEMACCESS(1)
-    "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB pixels.
-    "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
-    "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
-    "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
-    "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
-    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
-    "vrshrn.u16 d1, q1, #2                     \n"
-    "vrshrn.u16 d2, q2, #2                     \n"
-    "vrshrn.u16 d3, q3, #2                     \n"
-    MEMACCESS(2)
-    "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
-    "bgt        1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
-  );
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add        %1, %1, %0                     \n"
+      "1:                                        \n"
+      "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels.
+      "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ARGB
+      "subs       %3, %3, #8                     \n"  // 8 processed per loop.
+      "vpaddl.u8  q0, q0                         \n"  // B 16 bytes -> 8 shorts.
+      "vpaddl.u8  q1, q1                         \n"  // G 16 bytes -> 8 shorts.
+      "vpaddl.u8  q2, q2                         \n"  // R 16 bytes -> 8 shorts.
+      "vpaddl.u8  q3, q3                         \n"  // A 16 bytes -> 8 shorts.
+      "vld4.8     {d16, d18, d20, d22}, [%1]!    \n"  // load 8 more ARGB
+      "vld4.8     {d17, d19, d21, d23}, [%1]!    \n"  // load last 8 ARGB
+      "vpadal.u8  q0, q8                         \n"  // B 16 bytes -> 8 shorts.
+      "vpadal.u8  q1, q9                         \n"  // G 16 bytes -> 8 shorts.
+      "vpadal.u8  q2, q10                        \n"  // R 16 bytes -> 8 shorts.
+      "vpadal.u8  q3, q11                        \n"  // A 16 bytes -> 8 shorts.
+      "vrshrn.u16 d0, q0, #2                     \n"  // round and pack to bytes
+      "vrshrn.u16 d1, q1, #2                     \n"
+      "vrshrn.u16 d2, q2, #2                     \n"
+      "vrshrn.u16 d3, q3, #2                     \n"
+      "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"
+      "bgt        1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
 }
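
The vpaddl (widening pairwise add within a row), vpadal (accumulate the
second row) and vrshrn #2 steps add up to a rounded 2x2 box average per
channel; a scalar sketch:

    #include <stdint.h>

    // 2x2 box average with rounding: sum four samples, add 2, shift right 2.
    static uint8_t box2x2(uint8_t p00, uint8_t p01, uint8_t p10, uint8_t p11) {
      return (uint8_t)((p00 + p01 + p10 + p11 + 2) >> 2);
    }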
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "mov        r12, %3, lsl #2                \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.32    {d0[0]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d0[1]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d1[0]}, [%0], r12             \n"
-    MEMACCESS(0)
-    "vld1.32    {d1[1]}, [%0], r12             \n"
-    "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
-    MEMACCESS(1)
-    "vst1.8     {q0}, [%1]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(dst_width)    // %2
-  : "r"(src_stepx)     // %3
-  : "memory", "cc", "r12", "q0"
-  );
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "mov        r12, %3, lsl #2                \n"
+      "1:                                        \n"
+      "vld1.32    {d0[0]}, [%0], r12             \n"
+      "vld1.32    {d0[1]}, [%0], r12             \n"
+      "vld1.32    {d1[0]}, [%0], r12             \n"
+      "vld1.32    {d1[1]}, [%0], r12             \n"
+      "subs       %2, %2, #4                     \n"  // 4 pixels per loop.
+      "vst1.8     {q0}, [%1]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+      : "r"(src_stepx)   // %3
+      : "memory", "cc", "r12", "q0");
 }
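
The "mov r12, %3, lsl #2" converts src_stepx from pixels to bytes, since an
ARGB pixel is 4 bytes. A scalar sketch of the row (the uint32_t view leans on
the 4-byte alignment requirement stated above):

    #include <stdint.h>

    // Copy every src_stepx-th ARGB pixel, one 32-bit word each.
    static void argb_down_even_c(const uint8_t* src, int src_stepx,
                                 uint8_t* dst, int dst_width) {
      const uint32_t* s = (const uint32_t*)src;
      uint32_t* d = (uint32_t*)dst;
      for (int i = 0; i < dst_width; ++i) {
        d[i] = s[i * src_stepx];
      }
    }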
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "mov        r12, %4, lsl #2                \n"
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "vld1.8     {d0}, [%0], r12                \n"  // Read 4 2x2 blocks -> 2x1
-    MEMACCESS(1)
-    "vld1.8     {d1}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d2}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d3}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d4}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d5}, [%1], r12                \n"
-    MEMACCESS(0)
-    "vld1.8     {d6}, [%0], r12                \n"
-    MEMACCESS(1)
-    "vld1.8     {d7}, [%1], r12                \n"
-    "vaddl.u8   q0, d0, d1                     \n"
-    "vaddl.u8   q1, d2, d3                     \n"
-    "vaddl.u8   q2, d4, d5                     \n"
-    "vaddl.u8   q3, d6, d7                     \n"
-    "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
-    "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
-    "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
-    "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
-    "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
-    "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
-    "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
-    MEMACCESS(2)
-    "vst1.8     {q0}, [%2]!                    \n"
-    "bgt        1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(src_stride),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(dst_width)    // %3
-  : "r"(src_stepx)     // %4
-  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
-  );
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  asm volatile(
+      "mov        r12, %4, lsl #2                \n"
+      "add        %1, %1, %0                     \n"
+      "1:                                        \n"
+      "vld1.8     {d0}, [%0], r12                \n"  // 4 2x2 blocks -> 2x1
+      "vld1.8     {d1}, [%1], r12                \n"
+      "vld1.8     {d2}, [%0], r12                \n"
+      "vld1.8     {d3}, [%1], r12                \n"
+      "vld1.8     {d4}, [%0], r12                \n"
+      "vld1.8     {d5}, [%1], r12                \n"
+      "vld1.8     {d6}, [%0], r12                \n"
+      "vld1.8     {d7}, [%1], r12                \n"
+      "vaddl.u8   q0, d0, d1                     \n"
+      "vaddl.u8   q1, d2, d3                     \n"
+      "vaddl.u8   q2, d4, d5                     \n"
+      "vaddl.u8   q3, d6, d7                     \n"
+      "vswp.8     d1, d2                         \n"  // ab_cd -> ac_bd
+      "vswp.8     d5, d6                         \n"  // ef_gh -> eg_fh
+      "vadd.u16   q0, q0, q1                     \n"  // (a+b)_(c+d)
+      "vadd.u16   q2, q2, q3                     \n"  // (e+f)_(g+h)
+      "vrshrn.u16 d0, q0, #2                     \n"  // first 2 pixels.
+      "vrshrn.u16 d1, q2, #2                     \n"  // next 2 pixels.
+      "subs       %3, %3, #4                     \n"  // 4 pixels per loop.
+      "vst1.8     {q0}, [%2]!                    \n"
+      "bgt        1b                             \n"
+      : "+r"(src_argb),    // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst_argb),    // %2
+        "+r"(dst_width)    // %3
+      : "r"(src_stepx)     // %4
+      : "memory", "cc", "r12", "q0", "q1", "q2", "q3");
 }
 
 // TODO(Yang Zhang): Investigate using fewer load instructions for
 // the x/dx stepping.
-#define LOAD1_DATA32_LANE(dn, n)                               \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5, lsl #2             \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "vld1.32    {"#dn"["#n"]}, [%6]            \n"
+#define LOAD1_DATA32_LANE(dn, n)                 \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5, lsl #2             \n" \
+  "add        %3, %3, %4                     \n" \
+  "vld1.32    {" #dn "[" #n "]}, [%6]        \n"
 
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
   int tmp;
-  const uint8* src_tmp = src_argb;
-  asm volatile (
-  "1:                                          \n"
-    LOAD1_DATA32_LANE(d0, 0)
-    LOAD1_DATA32_LANE(d0, 1)
-    LOAD1_DATA32_LANE(d1, 0)
-    LOAD1_DATA32_LANE(d1, 1)
-    LOAD1_DATA32_LANE(d2, 0)
-    LOAD1_DATA32_LANE(d2, 1)
-    LOAD1_DATA32_LANE(d3, 0)
-    LOAD1_DATA32_LANE(d3, 1)
-
-    MEMACCESS(0)
-    "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
-    "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "bgt        1b                             \n"
-  : "+r"(dst_argb),   // %0
-    "+r"(src_argb),   // %1
-    "+r"(dst_width),  // %2
-    "+r"(x),          // %3
-    "+r"(dx),         // %4
-    "=&r"(tmp),       // %5
-    "+r"(src_tmp)     // %6
-  :
-  : "memory", "cc", "q0", "q1"
-  );
+  const uint8_t* src_tmp = src_argb;
+  asm volatile(
+      "1:                                        \n"
+      // clang-format off
+      LOAD1_DATA32_LANE(d0, 0)
+      LOAD1_DATA32_LANE(d0, 1)
+      LOAD1_DATA32_LANE(d1, 0)
+      LOAD1_DATA32_LANE(d1, 1)
+      LOAD1_DATA32_LANE(d2, 0)
+      LOAD1_DATA32_LANE(d2, 1)
+      LOAD1_DATA32_LANE(d3, 0)
+      LOAD1_DATA32_LANE(d3, 1)
+      // clang-format on
+      "vst1.32     {q0, q1}, [%0]!               \n"  // store pixels
+      "subs       %2, %2, #8                     \n"  // 8 processed per loop
+      "bgt        1b                             \n"
+      : "+r"(dst_argb),   // %0
+        "+r"(src_argb),   // %1
+        "+r"(dst_width),  // %2
+        "+r"(x),          // %3
+        "+r"(dx),         // %4
+        "=&r"(tmp),       // %5
+        "+r"(src_tmp)     // %6
+      :
+      : "memory", "cc", "q0", "q1");
 }
 
 #undef LOAD1_DATA32_LANE
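
What LOAD1_DATA32_LANE steps through, in scalar form: x is a 16.16
fixed-point source position advanced by dx per output pixel (a sketch of the
addressing only; the lane loads are a NEON detail):

    #include <stdint.h>

    // Point-sample ARGB columns: the source index is the integer part of x.
    static void argb_cols_c(uint32_t* dst, const uint32_t* src,
                            int dst_width, int x, int dx) {
      for (int i = 0; i < dst_width; ++i) {
        dst[i] = src[x >> 16];
        x += dx;
      }
    }
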
@@ -946,18 +894,20 @@
 
 // TODO(Yang Zhang): Investigate using fewer load instructions for
 // the x/dx stepping.
-#define LOAD2_DATA32_LANE(dn1, dn2, n)                         \
-    "lsr        %5, %3, #16                           \n"      \
-    "add        %6, %1, %5, lsl #2                    \n"      \
-    "add        %3, %3, %4                            \n"      \
-    MEMACCESS(6)                                               \
-    "vld2.32    {"#dn1"["#n"], "#dn2"["#n"]}, [%6]    \n"
+#define LOAD2_DATA32_LANE(dn1, dn2, n)                       \
+  "lsr        %5, %3, #16                                \n" \
+  "add        %6, %1, %5, lsl #2                         \n" \
+  "add        %3, %3, %4                                 \n" \
+  "vld2.32    {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
 
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+                              const uint8_t* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_argb;
+  const uint8_t* src_tmp = src_argb;
   asm volatile (
     "vdup.32    q0, %3                         \n"  // x
     "vdup.32    q1, %4                         \n"  // dx
@@ -993,7 +943,6 @@
     "vshrn.i16   d0, q11, #7                   \n"
     "vshrn.i16   d1, q12, #7                   \n"
 
-    MEMACCESS(0)
     "vst1.32     {d0, d1}, [%0]!               \n"  // store pixels
     "vadd.s32    q8, q8, q9                    \n"
     "subs        %2, %2, #4                    \n"  // 4 processed per loop
--- a/third_party/libyuv/source/scale_neon64.cc
+++ b/third_party/libyuv/source/scale_neon64.cc
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "libyuv/scale.h"
 #include "libyuv/row.h"
+#include "libyuv/scale.h"
 #include "libyuv/scale_row.h"
 
 #ifdef __cplusplus
@@ -21,580 +21,556 @@
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 // Read 32x1, throw away even pixels, and write 16x1.
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into v0, odd into v1
-    MEMACCESS(0)
-    "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
-    MEMACCESS(1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1"              // Clobber List
-  );
+void ScaleRowDown2_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load even pixels into v0, odd into v1
+      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
+      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+      "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1"  // Clobber List
+      );
 }
 
 // Read 32x1, average down and write 16x1.
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b,v1.16b}, [%0], #32     \n"  // load pixels and post inc
-    "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
-    "uaddlp     v0.8h, v0.16b                  \n"  // add adjacent
-    "uaddlp     v1.8h, v1.16b                  \n"
-    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
-    "rshrn2     v0.16b, v1.8h, #1              \n"
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst),              // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1"     // Clobber List
-  );
+void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint8_t* dst,
+                              int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load even pixels into v0, odd into v1
+      "ld2        {v0.16b,v1.16b}, [%0], #32     \n"
+      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
+      "urhadd     v0.16b, v0.16b, v1.16b         \n"  // rounding half add
+      "st1        {v0.16b}, [%1], #16            \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1"  // Clobber List
+      );
 }
 
 // Read 32x2, average down and write 16x1.
-void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.16b,v1.16b}, [%0], #32    \n"  // load row 1 and post inc
-    MEMACCESS(1)
-    "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
-    "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
-    "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
-    "uaddlp     v1.8h, v1.16b                  \n"
-    "uadalp     v0.8h, v2.16b                  \n"  // row 2 add adjacent + row1
-    "uadalp     v1.8h, v3.16b                  \n"
-    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
-    "rshrn2     v0.16b, v1.8h, #2              \n"
-    MEMACCESS(2)
-    "st1        {v0.16b}, [%2], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(src_stride),       // %1
-    "+r"(dst),              // %2
-    "+r"(dst_width)         // %3
-  :
-  : "v0", "v1", "v2", "v3"     // Clobber List
-  );
+void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst,
+                           int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add        %1, %1, %0                     \n"
+      "1:                                        \n"
+      "ld1        {v0.16b, v1.16b}, [%0], #32    \n"  // load row 1 and post inc
+      "ld1        {v2.16b, v3.16b}, [%1], #32    \n"  // load row 2 and post inc
+      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
+      "uaddlp     v0.8h, v0.16b                  \n"  // row 1 add adjacent
+      "uaddlp     v1.8h, v1.16b                  \n"
+      "uadalp     v0.8h, v2.16b                  \n"  // += row 2 add adjacent
+      "uadalp     v1.8h, v3.16b                  \n"
+      "rshrn      v0.8b, v0.8h, #2               \n"  // round and pack
+      "rshrn2     v0.16b, v1.8h, #2              \n"
+      "st1        {v0.16b}, [%2], #16            \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
 
-void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32          \n"  // src line 0
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    MEMACCESS(1)
-    "st1     {v2.8b}, [%1], #8                 \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+void ScaleRowDown4_NEON(const uint8_t* src_ptr,
+                        ptrdiff_t src_stride,
+                        uint8_t* dst_ptr,
+                        int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "ld4     {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32  \n"  // src line 0
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "st1     {v2.8b}, [%1], #8                 \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
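
The ld4 deinterleaves each group of 4 pixels across v0..v3 and only v2 is
stored, i.e. one point sample per group; a scalar sketch:

    #include <stdint.h>

    // Keep pixel 2 of every 4 (the lane set that lands in v2).
    static void row_down4_c(const uint8_t* src, uint8_t* dst, int dst_width) {
      for (int i = 0; i < dst_width; ++i) {
        dst[i] = src[i * 4 + 2];
      }
    }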
 
-void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride;
-  const uint8* src_ptr2 = src_ptr + src_stride * 2;
-  const uint8* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1     {v0.16b}, [%0], #16               \n"   // load up 16x4
-    MEMACCESS(3)
-    "ld1     {v1.16b}, [%2], #16               \n"
-    MEMACCESS(4)
-    "ld1     {v2.16b}, [%3], #16               \n"
-    MEMACCESS(5)
-    "ld1     {v3.16b}, [%4], #16               \n"
-    "subs    %w5, %w5, #4                      \n"
-    "uaddlp  v0.8h, v0.16b                     \n"
-    "uadalp  v0.8h, v1.16b                     \n"
-    "uadalp  v0.8h, v2.16b                     \n"
-    "uadalp  v0.8h, v3.16b                     \n"
-    "addp    v0.8h, v0.8h, v0.8h               \n"
-    "rshrn   v0.8b, v0.8h, #4                  \n"   // divide by 16 w/rounding
-    MEMACCESS(1)
-    "st1    {v0.s}[0], [%1], #4                \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_ptr),   // %0
-    "+r"(dst_ptr),   // %1
-    "+r"(src_ptr1),  // %2
-    "+r"(src_ptr2),  // %3
-    "+r"(src_ptr3),  // %4
-    "+r"(dst_width)  // %5
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
+                           ptrdiff_t src_stride,
+                           uint8_t* dst_ptr,
+                           int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride;
+  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
+  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
+  asm volatile(
+      "1:                                        \n"
+      "ld1     {v0.16b}, [%0], #16               \n"  // load up 16x4
+      "ld1     {v1.16b}, [%2], #16               \n"
+      "ld1     {v2.16b}, [%3], #16               \n"
+      "ld1     {v3.16b}, [%4], #16               \n"
+      "subs    %w5, %w5, #4                      \n"
+      "uaddlp  v0.8h, v0.16b                     \n"
+      "uadalp  v0.8h, v1.16b                     \n"
+      "uadalp  v0.8h, v2.16b                     \n"
+      "uadalp  v0.8h, v3.16b                     \n"
+      "addp    v0.8h, v0.8h, v0.8h               \n"
+      "rshrn   v0.8b, v0.8h, #4                  \n"  // divide by 16 w/rounding
+      "st1    {v0.s}[0], [%1], #4                \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(src_ptr1),  // %2
+        "+r"(src_ptr2),  // %3
+        "+r"(src_ptr3),  // %4
+        "+r"(dst_width)  // %5
+      :
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
 
 // Down scale from 4 to 3 pixels. Use the neon multilane read/write
 // to load every 4th pixel into 4 different registers.
 // Point samples 32 pixels to 24 pixels.
-void ScaleRowDown34_NEON(const uint8* src_ptr,
+void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    "subs      %w2, %w2, #24                           \n"
-    "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0, v1, v2
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  :
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                                \n"
+      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
+      "subs      %w2, %w2, #24                           \n"
+      "orr       v2.16b, v3.16b, v3.16b                  \n"  // order v0,v1,v2
+      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      :
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
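
The orr overwrites v2 with v3 before the st3, so of every 4 input pixels the
ones at indices 0, 1 and 3 survive; a scalar sketch:

    #include <stdint.h>

    // 4 -> 3 point sample keeping indices 0, 1 and 3 of each group.
    static void row_down34_c(const uint8_t* src, uint8_t* dst, int dst_width) {
      for (int i = 0; i < dst_width / 3; ++i) {
        dst[i * 3 + 0] = src[i * 4 + 0];
        dst[i * 3 + 1] = src[i * 4 + 1];
        dst[i * 3 + 2] = src[i * 4 + 3];
      }
    }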
 
-void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movi      v20.8b, #3                              \n"
-    "add       %3, %3, %0                              \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
-    "subs         %w2, %w2, #24                        \n"
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "movi      v20.8b, #3                              \n"
+      "add       %3, %3, %0                              \n"
+      "1:                                                \n"
+      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
+      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
+      "subs         %w2, %w2, #24                        \n"
 
-    // filter src line 0 with src line 1
-    // expand chars to shorts to allow for room
-    // when adding lines together
-    "ushll     v16.8h, v4.8b, #0                       \n"
-    "ushll     v17.8h, v5.8b, #0                       \n"
-    "ushll     v18.8h, v6.8b, #0                       \n"
-    "ushll     v19.8h, v7.8b, #0                       \n"
+      // filter src line 0 with src line 1
+      // expand chars to shorts to allow for room
+      // when adding lines together
+      "ushll     v16.8h, v4.8b, #0                       \n"
+      "ushll     v17.8h, v5.8b, #0                       \n"
+      "ushll     v18.8h, v6.8b, #0                       \n"
+      "ushll     v19.8h, v7.8b, #0                       \n"
 
-    // 3 * line_0 + line_1
-    "umlal     v16.8h, v0.8b, v20.8b                   \n"
-    "umlal     v17.8h, v1.8b, v20.8b                   \n"
-    "umlal     v18.8h, v2.8b, v20.8b                   \n"
-    "umlal     v19.8h, v3.8b, v20.8b                   \n"
+      // 3 * line_0 + line_1
+      "umlal     v16.8h, v0.8b, v20.8b                   \n"
+      "umlal     v17.8h, v1.8b, v20.8b                   \n"
+      "umlal     v18.8h, v2.8b, v20.8b                   \n"
+      "umlal     v19.8h, v3.8b, v20.8b                   \n"
 
-    // (3 * line_0 + line_1) >> 2
-    "uqrshrn   v0.8b, v16.8h, #2                       \n"
-    "uqrshrn   v1.8b, v17.8h, #2                       \n"
-    "uqrshrn   v2.8b, v18.8h, #2                       \n"
-    "uqrshrn   v3.8b, v19.8h, #2                       \n"
+      // (3 * line_0 + line_1) >> 2
+      "uqrshrn   v0.8b, v16.8h, #2                       \n"
+      "uqrshrn   v1.8b, v17.8h, #2                       \n"
+      "uqrshrn   v2.8b, v18.8h, #2                       \n"
+      "uqrshrn   v3.8b, v19.8h, #2                       \n"
 
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "ushll     v16.8h, v1.8b, #0                       \n"
-    "umlal     v16.8h, v0.8b, v20.8b                   \n"
-    "uqrshrn   v0.8b, v16.8h, #2                       \n"
+      // a0 = (src[0] * 3 + src[1] * 1) >> 2
+      "ushll     v16.8h, v1.8b, #0                       \n"
+      "umlal     v16.8h, v0.8b, v20.8b                   \n"
+      "uqrshrn   v0.8b, v16.8h, #2                       \n"
 
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+      // a1 = (src[1] * 1 + src[2] * 1) >> 1
+      "urhadd    v1.8b, v1.8b, v2.8b                     \n"
 
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "ushll     v16.8h, v2.8b, #0                       \n"
-    "umlal     v16.8h, v3.8b, v20.8b                   \n"
-    "uqrshrn   v2.8b, v16.8h, #2                       \n"
+      // a2 = (src[2] * 1 + src[3] * 3) >> 2
+      "ushll     v16.8h, v2.8b, #0                       \n"
+      "umlal     v16.8h, v3.8b, v20.8b                   \n"
+      "uqrshrn   v2.8b, v16.8h, #2                       \n"
 
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
+      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
 
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19",
-    "v20", "memory", "cc"
-  );
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19", "v20", "memory", "cc");
 }
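
The vertical pass above weights the two rows as (3 * line_0 + line_1 + 2) >> 2,
then the horizontal taps reduce each 4 pixels to 3 exactly as the a0/a1/a2
comments state. In scalar form (uqrshrn and urhadd both round):

    #include <stdint.h>

    // Horizontal 4 -> 3 taps, matching the a0/a1/a2 comments above.
    static void down34_taps(const uint8_t p[4], uint8_t out[3]) {
      out[0] = (uint8_t)((p[0] * 3 + p[1] + 2) >> 2);
      out[1] = (uint8_t)((p[1] + p[2] + 1) >> 1);
      out[2] = (uint8_t)((p[2] + p[3] * 3 + 2) >> 2);
    }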
 
-void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    "movi      v20.8b, #3                              \n"
-    "add       %3, %3, %0                              \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"  // src line 0
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32                \n"  // src line 1
-    "subs         %w2, %w2, #24                        \n"
-    // average src line 0 with src line 1
-    "urhadd    v0.8b, v0.8b, v4.8b                     \n"
-    "urhadd    v1.8b, v1.8b, v5.8b                     \n"
-    "urhadd    v2.8b, v2.8b, v6.8b                     \n"
-    "urhadd    v3.8b, v3.8b, v7.8b                     \n"
+                               uint8_t* dst_ptr,
+                               int dst_width) {
+  asm volatile(
+      "movi      v20.8b, #3                              \n"
+      "add       %3, %3, %0                              \n"
+      "1:                                                \n"
+      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"  // src line 0
+      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32    \n"  // src line 1
+      "subs         %w2, %w2, #24                        \n"
+      // average src line 0 with src line 1
+      "urhadd    v0.8b, v0.8b, v4.8b                     \n"
+      "urhadd    v1.8b, v1.8b, v5.8b                     \n"
+      "urhadd    v2.8b, v2.8b, v6.8b                     \n"
+      "urhadd    v3.8b, v3.8b, v7.8b                     \n"
 
-    // a0 = (src[0] * 3 + s[1] * 1) >> 2
-    "ushll     v4.8h, v1.8b, #0                        \n"
-    "umlal     v4.8h, v0.8b, v20.8b                    \n"
-    "uqrshrn   v0.8b, v4.8h, #2                        \n"
+      // a0 = (src[0] * 3 + src[1] * 1) >> 2
+      "ushll     v4.8h, v1.8b, #0                        \n"
+      "umlal     v4.8h, v0.8b, v20.8b                    \n"
+      "uqrshrn   v0.8b, v4.8h, #2                        \n"
 
-    // a1 = (src[1] * 1 + s[2] * 1) >> 1
-    "urhadd    v1.8b, v1.8b, v2.8b                     \n"
+      // a1 = (src[1] * 1 + src[2] * 1) >> 1
+      "urhadd    v1.8b, v1.8b, v2.8b                     \n"
 
-    // a2 = (src[2] * 1 + s[3] * 3) >> 2
-    "ushll     v4.8h, v2.8b, #0                        \n"
-    "umlal     v4.8h, v3.8b, v20.8b                    \n"
-    "uqrshrn   v2.8b, v4.8h, #2                        \n"
+      // a2 = (src[2] * 1 + src[3] * 3) >> 2
+      "ushll     v4.8h, v2.8b, #0                        \n"
+      "umlal     v4.8h, v3.8b, v20.8b                    \n"
+      "uqrshrn   v2.8b, v4.8h, #2                        \n"
 
-    MEMACCESS(1)
-    "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24                \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width),        // %2
-    "+r"(src_stride)        // %3
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"
-  );
+      "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24          \n"
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),    // %0
+        "+r"(dst_ptr),    // %1
+        "+r"(dst_width),  // %2
+        "+r"(src_stride)  // %3
+      :
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
 }
 
-static uvec8 kShuf38 =
-  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
-static uvec8 kShuf38_2 =
-  { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 };
-static vec16 kMult38_Div6 =
-  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
-    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
-static vec16 kMult38_Div9 =
-  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
-    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
+static const uvec8 kShuf38 = {0,  3,  6,  8,  11, 14, 16, 19,
+                              22, 24, 27, 30, 0,  0,  0,  0};
+static const uvec8 kShuf38_2 = {0,  16, 32, 2,  18, 33, 4, 20,
+                                34, 6,  22, 35, 0,  0,  0, 0};
+static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12, 65536 / 12,
+                                   65536 / 12, 65536 / 12};
+static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18, 65536 / 18,
+                                   65536 / 18, 65536 / 18};
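
The halved-looking constants are deliberate: sqrdmulh is a doubling multiply
that returns the rounded high 16 bits, so multiplying by 65536/12 acts as a
divide by 6, and 65536/18 as a divide by 9. One lane in scalar form
(saturation omitted):

    #include <stdint.h>

    // sqrdmulh on one 16-bit lane: high half of 2*a*b, rounded.
    static int16_t sqrdmulh16(int16_t a, int16_t b) {
      return (int16_t)((2 * a * b + 0x8000) >> 16);
    }

    // sqrdmulh16(600, 65536 / 12) == 100, i.e. 600 / 6.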
 
 // 32 -> 12
-void ScaleRowDown38_NEON(const uint8* src_ptr,
+void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
-  asm volatile (
-    MEMACCESS(3)
-    "ld1       {v3.16b}, [%3]                          \n"
-  "1:                                                  \n"
-    MEMACCESS(0)
-    "ld1       {v0.16b,v1.16b}, [%0], #32             \n"
-    "subs      %w2, %w2, #12                           \n"
-    "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b        \n"
-    MEMACCESS(1)
-    "st1       {v2.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v2.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(dst_width)         // %2
-  : "r"(&kShuf38)           // %3
-  : "v0", "v1", "v2", "v3", "memory", "cc"
-  );
+                         uint8_t* dst_ptr,
+                         int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "ld1       {v3.16b}, [%3]                          \n"
+      "1:                                                \n"
+      "ld1       {v0.16b,v1.16b}, [%0], #32              \n"
+      "subs      %w2, %w2, #12                           \n"
+      "tbl       v2.16b, {v0.16b,v1.16b}, v3.16b         \n"
+      "st1       {v2.8b}, [%1], #8                       \n"
+      "st1       {v2.s}[2], [%1], #4                     \n"
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst_ptr),   // %1
+        "+r"(dst_width)  // %2
+      : "r"(&kShuf38)    // %3
+      : "v0", "v1", "v2", "v3", "memory", "cc");
 }
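
The tbl picks 12 of the 32 loaded bytes at the offsets listed in kShuf38 (the
trailing zeros in the table are padding; only 12 bytes are stored). A scalar
sketch of the point sample:

    #include <stdint.h>

    // 32 -> 12 point sample at the kShuf38 offsets.
    static void row_down38_c(const uint8_t* src, uint8_t* dst) {
      static const uint8_t kIdx[12] = {0,  3,  6,  8,  11, 14,
                                       16, 19, 22, 24, 27, 30};
      for (int i = 0; i < 12; ++i) {
        dst[i] = src[kIdx[i]];
      }
    }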
 
 // 32x3 -> 12x1
-void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
+void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                       ptrdiff_t src_stride,
-                                      uint8* dst_ptr, int dst_width) {
-  const uint8* src_ptr1 = src_ptr + src_stride * 2;
+                                      uint8_t* dst_ptr,
+                                      int dst_width) {
+  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
   ptrdiff_t tmp_src_stride = src_stride;
 
-  asm volatile (
-    MEMACCESS(5)
-    "ld1       {v29.8h}, [%5]                          \n"
-    MEMACCESS(6)
-    "ld1       {v30.16b}, [%6]                         \n"
-    MEMACCESS(7)
-    "ld1       {v31.8h}, [%7]                          \n"
-    "add       %2, %2, %0                              \n"
-  "1:                                                  \n"
+  asm volatile(
+      "ld1       {v29.8h}, [%5]                          \n"
+      "ld1       {v30.16b}, [%6]                         \n"
+      "ld1       {v31.8h}, [%7]                          \n"
+      "add       %2, %2, %0                              \n"
+      "1:                                                \n"
 
-    // 00 40 01 41 02 42 03 43
-    // 10 50 11 51 12 52 13 53
-    // 20 60 21 61 22 62 23 63
-    // 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
-    MEMACCESS(4)
-    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32              \n"
-    "subs      %w4, %w4, #12                           \n"
+      // 00 40 01 41 02 42 03 43
+      // 10 50 11 51 12 52 13 53
+      // 20 60 21 61 22 62 23 63
+      // 30 70 31 71 32 72 33 73
+      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"
+      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32    \n"
+      "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32  \n"
+      "subs      %w4, %w4, #12                           \n"
 
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // 00 10 01 11 02 12 03 13
-    // 40 50 41 51 42 52 43 53
-    "trn1      v20.8b, v0.8b, v1.8b                    \n"
-    "trn2      v21.8b, v0.8b, v1.8b                    \n"
-    "trn1      v22.8b, v4.8b, v5.8b                    \n"
-    "trn2      v23.8b, v4.8b, v5.8b                    \n"
-    "trn1      v24.8b, v16.8b, v17.8b                  \n"
-    "trn2      v25.8b, v16.8b, v17.8b                  \n"
+      // Shuffle the input data around to align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // 00 10 01 11 02 12 03 13
+      // 40 50 41 51 42 52 43 53
+      "trn1      v20.8b, v0.8b, v1.8b                    \n"
+      "trn2      v21.8b, v0.8b, v1.8b                    \n"
+      "trn1      v22.8b, v4.8b, v5.8b                    \n"
+      "trn2      v23.8b, v4.8b, v5.8b                    \n"
+      "trn1      v24.8b, v16.8b, v17.8b                  \n"
+      "trn2      v25.8b, v16.8b, v17.8b                  \n"
 
-    // 20 30 21 31 22 32 23 33
-    // 60 70 61 71 62 72 63 73
-    "trn1      v0.8b, v2.8b, v3.8b                     \n"
-    "trn2      v1.8b, v2.8b, v3.8b                     \n"
-    "trn1      v4.8b, v6.8b, v7.8b                     \n"
-    "trn2      v5.8b, v6.8b, v7.8b                     \n"
-    "trn1      v16.8b, v18.8b, v19.8b                  \n"
-    "trn2      v17.8b, v18.8b, v19.8b                  \n"
+      // 20 30 21 31 22 32 23 33
+      // 60 70 61 71 62 72 63 73
+      "trn1      v0.8b, v2.8b, v3.8b                     \n"
+      "trn2      v1.8b, v2.8b, v3.8b                     \n"
+      "trn1      v4.8b, v6.8b, v7.8b                     \n"
+      "trn2      v5.8b, v6.8b, v7.8b                     \n"
+      "trn1      v16.8b, v18.8b, v19.8b                  \n"
+      "trn2      v17.8b, v18.8b, v19.8b                  \n"
 
-    // 00+10 01+11 02+12 03+13
-    // 40+50 41+51 42+52 43+53
-    "uaddlp    v20.4h, v20.8b                          \n"
-    "uaddlp    v21.4h, v21.8b                          \n"
-    "uaddlp    v22.4h, v22.8b                          \n"
-    "uaddlp    v23.4h, v23.8b                          \n"
-    "uaddlp    v24.4h, v24.8b                          \n"
-    "uaddlp    v25.4h, v25.8b                          \n"
+      // 00+10 01+11 02+12 03+13
+      // 40+50 41+51 42+52 43+53
+      "uaddlp    v20.4h, v20.8b                          \n"
+      "uaddlp    v21.4h, v21.8b                          \n"
+      "uaddlp    v22.4h, v22.8b                          \n"
+      "uaddlp    v23.4h, v23.8b                          \n"
+      "uaddlp    v24.4h, v24.8b                          \n"
+      "uaddlp    v25.4h, v25.8b                          \n"
 
-    // 60+70 61+71 62+72 63+73
-    "uaddlp    v1.4h, v1.8b                            \n"
-    "uaddlp    v5.4h, v5.8b                            \n"
-    "uaddlp    v17.4h, v17.8b                          \n"
+      // 60+70 61+71 62+72 63+73
+      "uaddlp    v1.4h, v1.8b                            \n"
+      "uaddlp    v5.4h, v5.8b                            \n"
+      "uaddlp    v17.4h, v17.8b                          \n"
 
-    // combine source lines
-    "add       v20.4h, v20.4h, v22.4h                  \n"
-    "add       v21.4h, v21.4h, v23.4h                  \n"
-    "add       v20.4h, v20.4h, v24.4h                  \n"
-    "add       v21.4h, v21.4h, v25.4h                  \n"
-    "add       v2.4h, v1.4h, v5.4h                     \n"
-    "add       v2.4h, v2.4h, v17.4h                    \n"
+      // combine source lines
+      "add       v20.4h, v20.4h, v22.4h                  \n"
+      "add       v21.4h, v21.4h, v23.4h                  \n"
+      "add       v20.4h, v20.4h, v24.4h                  \n"
+      "add       v21.4h, v21.4h, v25.4h                  \n"
+      "add       v2.4h, v1.4h, v5.4h                     \n"
+      "add       v2.4h, v2.4h, v17.4h                    \n"
 
-    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
-    //             + s[6 + st * 1] + s[7 + st * 1]
-    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
-    "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
-    "xtn       v2.8b,  v2.8h                           \n"
+      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
+      //             + s[6 + st * 1] + s[7 + st * 1]
+      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
+      "sqrdmulh  v2.8h, v2.8h, v29.8h                    \n"
+      "xtn       v2.8b,  v2.8h                           \n"
 
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
-    "ushll     v16.8h, v16.8b, #0                      \n"
-    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      "ushll     v16.8h, v16.8b, #0                      \n"
+      "uaddl     v0.8h, v0.8b, v4.8b                     \n"
 
-    // combine source lines
-    "add       v0.8h, v0.8h, v16.8h                    \n"
+      // combine source lines
+      "add       v0.8h, v0.8h, v16.8h                    \n"
 
-    // xx 20 xx 21 xx 22 xx 23
-    // xx 30 xx 31 xx 32 xx 33
-    "trn1      v1.8h, v0.8h, v0.8h                     \n"
-    "trn2      v4.8h, v0.8h, v0.8h                     \n"
-    "xtn       v0.4h, v1.4s                            \n"
-    "xtn       v4.4h, v4.4s                            \n"
+      // xx 20 xx 21 xx 22 xx 23
+      // xx 30 xx 31 xx 32 xx 33
+      "trn1      v1.8h, v0.8h, v0.8h                     \n"
+      "trn2      v4.8h, v0.8h, v0.8h                     \n"
+      "xtn       v0.4h, v1.4s                            \n"
+      "xtn       v4.4h, v4.4s                            \n"
 
-    // 0+1+2, 3+4+5
-    "add       v20.8h, v20.8h, v0.8h                   \n"
-    "add       v21.8h, v21.8h, v4.8h                   \n"
+      // 0+1+2, 3+4+5
+      "add       v20.8h, v20.8h, v0.8h                   \n"
+      "add       v21.8h, v21.8h, v4.8h                   \n"
 
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
-    "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
+      // Need to divide, but can't downshift as the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "sqrdmulh  v0.8h, v20.8h, v31.8h                   \n"
+      "sqrdmulh  v1.8h, v21.8h, v31.8h                   \n"
 
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
-    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
+      // Align for table lookup, tbl requires registers to be adjacent
+      "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"
 
-    MEMACCESS(1)
-    "st1       {v3.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v3.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),          // %0
-    "+r"(dst_ptr),          // %1
-    "+r"(tmp_src_stride),   // %2
-    "+r"(src_ptr1),         // %3
-    "+r"(dst_width)         // %4
-  : "r"(&kMult38_Div6),     // %5
-    "r"(&kShuf38_2),        // %6
-    "r"(&kMult38_Div9)      // %7
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29",
-    "v30", "v31", "memory", "cc"
-  );
+      "st1       {v3.8b}, [%1], #8                       \n"
+      "st1       {v3.s}[2], [%1], #4                     \n"
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),         // %0
+        "+r"(dst_ptr),         // %1
+        "+r"(tmp_src_stride),  // %2
+        "+r"(src_ptr1),        // %3
+        "+r"(dst_width)        // %4
+      : "r"(&kMult38_Div6),    // %5
+        "r"(&kShuf38_2),       // %6
+        "r"(&kMult38_Div9)     // %7
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
+        "memory", "cc");
 }
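The sqrdmulh sequences above implement the reciprocal trick from the
comments: dividing a box sum by a non-power-of-2 becomes a fixed-point
multiply. A minimal scalar sketch of the idea (illustrative only; the
shipped kMult38_Div6/kMult38_Div9 constants are further adjusted for
sqrdmulh's doubling-and-rounding semantics):

// Divide by 6 via a 16.16 reciprocal: the high 16 bits of the product
// approximate sum / 6. Truncates slightly low; the NEON path rounds.
static inline uint8_t DivideBy6(uint16_t sum) {
  return (uint8_t)(((uint32_t)sum * (65536u / 6u)) >> 16);
}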
 
 // 32x2 -> 12x1
-void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
+void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+                               uint8_t* dst_ptr,
+                               int dst_width) {
   // TODO(fbarchard): use src_stride directly for clang 3.5+.
   ptrdiff_t tmp_src_stride = src_stride;
-  asm volatile (
-    MEMACCESS(4)
-    "ld1       {v30.8h}, [%4]                          \n"
-    MEMACCESS(5)
-    "ld1       {v31.16b}, [%5]                         \n"
-    "add       %2, %2, %0                              \n"
-  "1:                                                  \n"
+  asm volatile(
+      "ld1       {v30.8h}, [%4]                          \n"
+      "ld1       {v31.16b}, [%5]                         \n"
+      "add       %2, %2, %0                              \n"
+      "1:                                                \n"
 
-    // 00 40 01 41 02 42 03 43
-    // 10 50 11 51 12 52 13 53
-    // 20 60 21 61 22 62 23 63
-    // 30 70 31 71 32 72 33 73
-    MEMACCESS(0)
-    "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32                \n"
-    MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32                \n"
-    "subs      %w3, %w3, #12                           \n"
+      // 00 40 01 41 02 42 03 43
+      // 10 50 11 51 12 52 13 53
+      // 20 60 21 61 22 62 23 63
+      // 30 70 31 71 32 72 33 73
+      "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32    \n"
+      "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32    \n"
+      "subs      %w3, %w3, #12                           \n"
 
-    // Shuffle the input data around to get align the data
-    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
-    // 00 10 01 11 02 12 03 13
-    // 40 50 41 51 42 52 43 53
-    "trn1      v16.8b, v0.8b, v1.8b                    \n"
-    "trn2      v17.8b, v0.8b, v1.8b                    \n"
-    "trn1      v18.8b, v4.8b, v5.8b                    \n"
-    "trn2      v19.8b, v4.8b, v5.8b                    \n"
+      // Shuffle the input data around to align the data
+      //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
+      // 00 10 01 11 02 12 03 13
+      // 40 50 41 51 42 52 43 53
+      "trn1      v16.8b, v0.8b, v1.8b                    \n"
+      "trn2      v17.8b, v0.8b, v1.8b                    \n"
+      "trn1      v18.8b, v4.8b, v5.8b                    \n"
+      "trn2      v19.8b, v4.8b, v5.8b                    \n"
 
-    // 20 30 21 31 22 32 23 33
-    // 60 70 61 71 62 72 63 73
-    "trn1      v0.8b, v2.8b, v3.8b                     \n"
-    "trn2      v1.8b, v2.8b, v3.8b                     \n"
-    "trn1      v4.8b, v6.8b, v7.8b                     \n"
-    "trn2      v5.8b, v6.8b, v7.8b                     \n"
+      // 20 30 21 31 22 32 23 33
+      // 60 70 61 71 62 72 63 73
+      "trn1      v0.8b, v2.8b, v3.8b                     \n"
+      "trn2      v1.8b, v2.8b, v3.8b                     \n"
+      "trn1      v4.8b, v6.8b, v7.8b                     \n"
+      "trn2      v5.8b, v6.8b, v7.8b                     \n"
 
-    // 00+10 01+11 02+12 03+13
-    // 40+50 41+51 42+52 43+53
-    "uaddlp    v16.4h, v16.8b                          \n"
-    "uaddlp    v17.4h, v17.8b                          \n"
-    "uaddlp    v18.4h, v18.8b                          \n"
-    "uaddlp    v19.4h, v19.8b                          \n"
+      // 00+10 01+11 02+12 03+13
+      // 40+50 41+51 42+52 43+53
+      "uaddlp    v16.4h, v16.8b                          \n"
+      "uaddlp    v17.4h, v17.8b                          \n"
+      "uaddlp    v18.4h, v18.8b                          \n"
+      "uaddlp    v19.4h, v19.8b                          \n"
 
-    // 60+70 61+71 62+72 63+73
-    "uaddlp    v1.4h, v1.8b                            \n"
-    "uaddlp    v5.4h, v5.8b                            \n"
+      // 60+70 61+71 62+72 63+73
+      "uaddlp    v1.4h, v1.8b                            \n"
+      "uaddlp    v5.4h, v5.8b                            \n"
 
-    // combine source lines
-    "add       v16.4h, v16.4h, v18.4h                  \n"
-    "add       v17.4h, v17.4h, v19.4h                  \n"
-    "add       v2.4h, v1.4h, v5.4h                     \n"
+      // combine source lines
+      "add       v16.4h, v16.4h, v18.4h                  \n"
+      "add       v17.4h, v17.4h, v19.4h                  \n"
+      "add       v2.4h, v1.4h, v5.4h                     \n"
 
-    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
-    "uqrshrn   v2.8b, v2.8h, #2                        \n"
+      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
+      "uqrshrn   v2.8b, v2.8h, #2                        \n"
 
-    // Shuffle 2,3 reg around so that 2 can be added to the
-    //  0,1 reg and 3 can be added to the 4,5 reg. This
-    //  requires expanding from u8 to u16 as the 0,1 and 4,5
-    //  registers are already expanded. Then do transposes
-    //  to get aligned.
-    // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
+      // Shuffle 2,3 reg around so that 2 can be added to the
+      //  0,1 reg and 3 can be added to the 4,5 reg. This
+      //  requires expanding from u8 to u16 as the 0,1 and 4,5
+      //  registers are already expanded. Then do transposes
+      //  to get aligned.
+      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
 
-    // combine source lines
-    "uaddl     v0.8h, v0.8b, v4.8b                     \n"
+      // combine source lines
+      "uaddl     v0.8h, v0.8b, v4.8b                     \n"
 
-    // xx 20 xx 21 xx 22 xx 23
-    // xx 30 xx 31 xx 32 xx 33
-    "trn1      v1.8h, v0.8h, v0.8h                     \n"
-    "trn2      v4.8h, v0.8h, v0.8h                     \n"
-    "xtn       v0.4h, v1.4s                            \n"
-    "xtn       v4.4h, v4.4s                            \n"
+      // xx 20 xx 21 xx 22 xx 23
+      // xx 30 xx 31 xx 32 xx 33
+      "trn1      v1.8h, v0.8h, v0.8h                     \n"
+      "trn2      v4.8h, v0.8h, v0.8h                     \n"
+      "xtn       v0.4h, v1.4s                            \n"
+      "xtn       v4.4h, v4.4s                            \n"
 
-    // 0+1+2, 3+4+5
-    "add       v16.8h, v16.8h, v0.8h                   \n"
-    "add       v17.8h, v17.8h, v4.8h                   \n"
+      // 0+1+2, 3+4+5
+      "add       v16.8h, v16.8h, v0.8h                   \n"
+      "add       v17.8h, v17.8h, v4.8h                   \n"
 
-    // Need to divide, but can't downshift as the the value
-    //  isn't a power of 2. So multiply by 65536 / n
-    //  and take the upper 16 bits.
-    "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
-    "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
+      // Need to divide, but can't downshift as the value
+      //  isn't a power of 2. So multiply by 65536 / n
+      //  and take the upper 16 bits.
+      "sqrdmulh  v0.8h, v16.8h, v30.8h                   \n"
+      "sqrdmulh  v1.8h, v17.8h, v30.8h                   \n"
 
-    // Align for table lookup, vtbl requires registers to
-    //  be adjacent
+      // Align for table lookup, tbl requires registers to be adjacent
 
-    "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
+      "tbl       v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"
 
-    MEMACCESS(1)
-    "st1       {v3.8b}, [%1], #8                       \n"
-    MEMACCESS(1)
-    "st1       {v3.s}[2], [%1], #4                     \n"
-    "b.gt      1b                                      \n"
-  : "+r"(src_ptr),         // %0
-    "+r"(dst_ptr),         // %1
-    "+r"(tmp_src_stride),  // %2
-    "+r"(dst_width)        // %3
-  : "r"(&kMult38_Div6),    // %4
-    "r"(&kShuf38_2)        // %5
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
-    "v18", "v19", "v30", "v31", "memory", "cc"
-  );
+      "st1       {v3.8b}, [%1], #8                       \n"
+      "st1       {v3.s}[2], [%1], #4                     \n"
+      "b.gt      1b                                      \n"
+      : "+r"(src_ptr),         // %0
+        "+r"(dst_ptr),         // %1
+        "+r"(tmp_src_stride),  // %2
+        "+r"(dst_width)        // %3
+      : "r"(&kMult38_Div6),    // %4
+        "r"(&kShuf38_2)        // %5
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19", "v30", "v31", "memory", "cc");
 }
 
-void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                    uint16* dst_ptr, int src_width, int src_height) {
-  const uint8* src_tmp;
-  asm volatile (
-  "1:                                          \n"
-    "mov       %0, %1                          \n"
-    "mov       w12, %w5                        \n"
-    "eor       v2.16b, v2.16b, v2.16b          \n"
-    "eor       v3.16b, v3.16b, v3.16b          \n"
-  "2:                                          \n"
-    // load 16 pixels into q0
-    MEMACCESS(0)
-    "ld1       {v0.16b}, [%0], %3              \n"
-    "uaddw2    v3.8h, v3.8h, v0.16b            \n"
-    "uaddw     v2.8h, v2.8h, v0.8b             \n"
-    "subs      w12, w12, #1                    \n"
-    "b.gt      2b                              \n"
-    MEMACCESS(2)
-    "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
-    "add      %1, %1, #16                      \n"
-    "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
-    "b.gt     1b                               \n"
-  : "=&r"(src_tmp),    // %0
-    "+r"(src_ptr),     // %1
-    "+r"(dst_ptr),     // %2
-    "+r"(src_stride),  // %3
-    "+r"(src_width),   // %4
-    "+r"(src_height)   // %5
-  :
-  : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+void ScaleAddRows_NEON(const uint8_t* src_ptr,
+                       ptrdiff_t src_stride,
+                       uint16_t* dst_ptr,
+                       int src_width,
+                       int src_height) {
+  const uint8_t* src_tmp;
+  asm volatile(
+      "1:                                        \n"
+      "mov       %0, %1                          \n"
+      "mov       w12, %w5                        \n"
+      "eor       v2.16b, v2.16b, v2.16b          \n"
+      "eor       v3.16b, v3.16b, v3.16b          \n"
+      "2:                                        \n"
+      // load 16 pixels into q0
+      "ld1       {v0.16b}, [%0], %3              \n"
+      "uaddw2    v3.8h, v3.8h, v0.16b            \n"
+      "uaddw     v2.8h, v2.8h, v0.8b             \n"
+      "subs      w12, w12, #1                    \n"
+      "b.gt      2b                              \n"
+      "st1      {v2.8h, v3.8h}, [%2], #32        \n"  // store pixels
+      "add      %1, %1, #16                      \n"
+      "subs     %w4, %w4, #16                    \n"  // 16 processed per loop
+      "b.gt     1b                               \n"
+      : "=&r"(src_tmp),    // %0
+        "+r"(src_ptr),     // %1
+        "+r"(dst_ptr),     // %2
+        "+r"(src_stride),  // %3
+        "+r"(src_width),   // %4
+        "+r"(src_height)   // %5
+      :
+      : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
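For reference, the column-summing loop above corresponds to roughly the
following scalar C (a sketch, not the upstream fallback; it keeps the
same 16-bit accumulators as v2.8h/v3.8h):

static void ScaleAddRows_C_sketch(const uint8_t* src_ptr,
                                  ptrdiff_t src_stride, uint16_t* dst_ptr,
                                  int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    const uint8_t* s = src_ptr + x;
    uint16_t sum = 0;  // 16-bit accumulator, matching the NEON registers
    for (int y = 0; y < src_height; ++y) {
      sum += *s;
      s += src_stride;  // step down one row
    }
    dst_ptr[x] = sum;
  }
}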
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA8_LANE(n)                                    \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5                    \n"              \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "ld2        {v4.b, v5.b}["#n"], [%6]      \n"
+#define LOAD2_DATA8_LANE(n)                      \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5                     \n" \
+  "add        %3, %3, %4                     \n" \
+  "ld2        {v4.b, v5.b}[" #n "], [%6]     \n"
 
-void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
-                          int dst_width, int x, int dx) {
+// The NEON version mimics this formula (from row_common.cc):
+// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
+//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))
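A quick worked example of that formula: a = 100, b = 200, f = 0x8000
(one half in 16.16 fixed point) gives
100 + ((0x8000 * (200 - 100) + 0x8000) >> 16) = 100 + 50 = 150, the
midpoint.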
+
+void ScaleFilterCols_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          int dst_width,
+                          int x,
+                          int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_ptr;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
+  const uint8_t* src_tmp = src_ptr;
+  int64_t x64 = (int64_t)x;    // NOLINT
+  int64_t dx64 = (int64_t)dx;  // NOLINT
   asm volatile (
     "dup        v0.4s, %w3                     \n"  // x
     "dup        v1.4s, %w4                     \n"  // dx
@@ -626,12 +602,11 @@
     "ushll2    v6.4s, v6.8h, #0                \n"
     "mul       v16.4s, v16.4s, v7.4s           \n"
     "mul       v17.4s, v17.4s, v6.4s           \n"
-    "rshrn      v6.4h, v16.4s, #16             \n"
-    "rshrn2     v6.8h, v17.4s, #16             \n"
+    "rshrn     v6.4h, v16.4s, #16              \n"
+    "rshrn2    v6.8h, v17.4s, #16              \n"
     "add       v4.8h, v4.8h, v6.8h             \n"
     "xtn       v4.8b, v4.8h                    \n"
 
-    MEMACCESS(0)
     "st1       {v4.8b}, [%0], #8               \n"  // store pixels
     "add       v1.4s, v1.4s, v0.4s             \n"
     "add       v2.4s, v2.4s, v0.4s             \n"
@@ -639,7 +614,7 @@
     "b.gt      1b                              \n"
   : "+r"(dst_ptr),          // %0
     "+r"(src_ptr),          // %1
-    "+r"(dst_width64),      // %2
+    "+r"(dst_width),        // %2
     "+r"(x64),              // %3
     "+r"(dx64),             // %4
     "+r"(tmp),              // %5
@@ -653,214 +628,190 @@
 #undef LOAD2_DATA8_LANE
 
 // 16x2 -> 16x1
-void ScaleFilterRows_NEON(uint8* dst_ptr,
-                          const uint8* src_ptr, ptrdiff_t src_stride,
-                          int dst_width, int source_y_fraction) {
-    int y_fraction = 256 - source_y_fraction;
-  asm volatile (
-    "cmp          %w4, #0                      \n"
-    "b.eq         100f                         \n"
-    "add          %2, %2, %1                   \n"
-    "cmp          %w4, #64                     \n"
-    "b.eq         75f                          \n"
-    "cmp          %w4, #128                    \n"
-    "b.eq         50f                          \n"
-    "cmp          %w4, #192                    \n"
-    "b.eq         25f                          \n"
+void ScaleFilterRows_NEON(uint8_t* dst_ptr,
+                          const uint8_t* src_ptr,
+                          ptrdiff_t src_stride,
+                          int dst_width,
+                          int source_y_fraction) {
+  int y_fraction = 256 - source_y_fraction;
+  asm volatile(
+      "cmp          %w4, #0                      \n"
+      "b.eq         100f                         \n"
+      "add          %2, %2, %1                   \n"
+      "cmp          %w4, #64                     \n"
+      "b.eq         75f                          \n"
+      "cmp          %w4, #128                    \n"
+      "b.eq         50f                          \n"
+      "cmp          %w4, #192                    \n"
+      "b.eq         25f                          \n"
 
-    "dup          v5.8b, %w4                   \n"
-    "dup          v4.8b, %w5                   \n"
-    // General purpose row blend.
-  "1:                                          \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "umull        v6.8h, v0.8b, v4.8b          \n"
-    "umull2       v7.8h, v0.16b, v4.16b        \n"
-    "umlal        v6.8h, v1.8b, v5.8b          \n"
-    "umlal2       v7.8h, v1.16b, v5.16b        \n"
-    "rshrn        v0.8b, v6.8h, #8             \n"
-    "rshrn2       v0.16b, v7.8h, #8            \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         1b                           \n"
-    "b            99f                          \n"
+      "dup          v5.8b, %w4                   \n"
+      "dup          v4.8b, %w5                   \n"
+      // General purpose row blend.
+      "1:                                        \n"
+      "ld1          {v0.16b}, [%1], #16          \n"
+      "ld1          {v1.16b}, [%2], #16          \n"
+      "subs         %w3, %w3, #16                \n"
+      "umull        v6.8h, v0.8b, v4.8b          \n"
+      "umull2       v7.8h, v0.16b, v4.16b        \n"
+      "umlal        v6.8h, v1.8b, v5.8b          \n"
+      "umlal2       v7.8h, v1.16b, v5.16b        \n"
+      "rshrn        v0.8b, v6.8h, #8             \n"
+      "rshrn2       v0.16b, v7.8h, #8            \n"
+      "st1          {v0.16b}, [%0], #16          \n"
+      "b.gt         1b                           \n"
+      "b            99f                          \n"
 
-    // Blend 25 / 75.
-  "25:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         25b                          \n"
-    "b            99f                          \n"
+      // Blend 25 / 75.
+      "25:                                       \n"
+      "ld1          {v0.16b}, [%1], #16          \n"
+      "ld1          {v1.16b}, [%2], #16          \n"
+      "subs         %w3, %w3, #16                \n"
+      "urhadd       v0.16b, v0.16b, v1.16b       \n"
+      "urhadd       v0.16b, v0.16b, v1.16b       \n"
+      "st1          {v0.16b}, [%0], #16          \n"
+      "b.gt         25b                          \n"
+      "b            99f                          \n"
 
-    // Blend 50 / 50.
-  "50:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v1.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         50b                          \n"
-    "b            99f                          \n"
+      // Blend 50 / 50.
+      "50:                                       \n"
+      "ld1          {v0.16b}, [%1], #16          \n"
+      "ld1          {v1.16b}, [%2], #16          \n"
+      "subs         %w3, %w3, #16                \n"
+      "urhadd       v0.16b, v0.16b, v1.16b       \n"
+      "st1          {v0.16b}, [%0], #16          \n"
+      "b.gt         50b                          \n"
+      "b            99f                          \n"
 
-    // Blend 75 / 25.
-  "75:                                         \n"
-    MEMACCESS(1)
-    "ld1          {v1.16b}, [%1], #16          \n"
-    MEMACCESS(2)
-    "ld1          {v0.16b}, [%2], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    "urhadd       v0.16b, v0.16b, v1.16b       \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         75b                          \n"
-    "b            99f                          \n"
+      // Blend 75 / 25.
+      "75:                                       \n"
+      "ld1          {v1.16b}, [%1], #16          \n"
+      "ld1          {v0.16b}, [%2], #16          \n"
+      "subs         %w3, %w3, #16                \n"
+      "urhadd       v0.16b, v0.16b, v1.16b       \n"
+      "urhadd       v0.16b, v0.16b, v1.16b       \n"
+      "st1          {v0.16b}, [%0], #16          \n"
+      "b.gt         75b                          \n"
+      "b            99f                          \n"
 
-    // Blend 100 / 0 - Copy row unchanged.
-  "100:                                        \n"
-    MEMACCESS(1)
-    "ld1          {v0.16b}, [%1], #16          \n"
-    "subs         %w3, %w3, #16                \n"
-    MEMACCESS(0)
-    "st1          {v0.16b}, [%0], #16          \n"
-    "b.gt         100b                         \n"
+      // Blend 100 / 0 - Copy row unchanged.
+      "100:                                      \n"
+      "ld1          {v0.16b}, [%1], #16          \n"
+      "subs         %w3, %w3, #16                \n"
+      "st1          {v0.16b}, [%0], #16          \n"
+      "b.gt         100b                         \n"
 
-  "99:                                         \n"
-    MEMACCESS(0)
-    "st1          {v0.b}[15], [%0]             \n"
-  : "+r"(dst_ptr),          // %0
-    "+r"(src_ptr),          // %1
-    "+r"(src_stride),       // %2
-    "+r"(dst_width),        // %3
-    "+r"(source_y_fraction),// %4
-    "+r"(y_fraction)        // %5
-  :
-  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"
-  );
+      "99:                                       \n"
+      "st1          {v0.b}[15], [%0]             \n"
+      : "+r"(dst_ptr),            // %0
+        "+r"(src_ptr),            // %1
+        "+r"(src_stride),         // %2
+        "+r"(dst_width),          // %3
+        "+r"(source_y_fraction),  // %4
+        "+r"(y_fraction)          // %5
+      :
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
 }
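The general-purpose path above blends each byte as
(row0 * (256 - f) + row1 * f + 128) >> 8, where f is source_y_fraction.
A scalar sketch (names assumed):

static void FilterRows_C_sketch(uint8_t* dst, const uint8_t* row0,
                                const uint8_t* row1, int width, int f) {
  int f0 = 256 - f;  // y_fraction in the function above
  for (int x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((row0[x] * f0 + row1[x] * f + 128) >> 8);
  }
}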
 
-void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    // load even pixels into q0, odd into q1
-    MEMACCESS (0)
-    "ld2        {v0.4s, v1.4s}, [%0], #32      \n"
-    MEMACCESS (0)
-    "ld2        {v2.4s, v3.4s}, [%0], #32      \n"
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    MEMACCESS (1)
-    "st1        {v1.16b}, [%1], #16            \n"  // store odd pixels
-    MEMACCESS (1)
-    "st1        {v3.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r" (src_ptr),          // %0
-    "+r" (dst),              // %1
-    "+r" (dst_width)         // %2
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
-  );
+void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
+                            ptrdiff_t src_stride,
+                            uint8_t* dst,
+                            int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
+      "ld4        {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "mov        v2.16b, v3.16b                 \n"
+      "st2        {v1.4s,v2.4s}, [%1], #32       \n"  // store 8 odd pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),   // %0
+        "+r"(dst),       // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
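Treating each ARGB pixel as one 32-bit unit, the loop above reduces to
keeping the odd pixel of every pair, i.e. this sketch:

static void ARGBRowDown2_sketch(const uint32_t* src, uint32_t* dst,
                                int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[2 * x + 1];  // drop even pixels, keep odd ones
  }
}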
 
-void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS (0)
-    // load 8 ARGB pixels.
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
-    "rshrn      v0.8b, v0.8h, #1               \n"  // downshift, round and pack
-    "rshrn      v1.8b, v1.8h, #1               \n"
-    "rshrn      v2.8b, v2.8h, #1               \n"
-    "rshrn      v3.8b, v3.8h, #1               \n"
-    MEMACCESS (1)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32     \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),         // %0
-    "+r"(dst_argb),         // %1
-    "+r"(dst_width)         // %2
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3"    // Clobber List
-  );
+void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
+      "ld4        {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+
+      "urhadd     v0.16b, v0.16b, v1.16b         \n"  // rounding half add
+      "urhadd     v1.16b, v2.16b, v3.16b         \n"
+      "st2        {v0.4s,v1.4s}, [%1], #32       \n"  // store 8 pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),  // %0
+        "+r"(dst_argb),  // %1
+        "+r"(dst_width)  // %2
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
+      );
 }
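urhadd is a per-byte rounding average; per channel of each even/odd
pixel pair the linear filter computes the equivalent of:

static inline uint8_t RoundingHalfAdd(uint8_t a, uint8_t b) {
  return (uint8_t)(((int)a + (int)b + 1) >> 1);  // what urhadd does per lane
}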
 
-void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst, int dst_width) {
-  asm volatile (
-    // change the stride to row 2 pointer
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS (0)
-    "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64   \n"  // load 8 ARGB pixels.
-    "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
-    "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
-    "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
-    "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
-    "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
-    MEMACCESS (1)
-    "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8 more ARGB pixels.
-    "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
-    "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
-    "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
-    "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
-    "rshrn      v0.8b, v0.8h, #2               \n"  // downshift, round and pack
-    "rshrn      v1.8b, v1.8h, #2               \n"
-    "rshrn      v2.8b, v2.8h, #2               \n"
-    "rshrn      v3.8b, v3.8h, #2               \n"
-    MEMACCESS (2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
-    "b.gt       1b                             \n"
-  : "+r" (src_ptr),          // %0
-    "+r" (src_stride),       // %1
-    "+r" (dst),              // %2
-    "+r" (dst_width)         // %3
-  :
-  : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"
-  );
+void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
+                               ptrdiff_t src_stride,
+                               uint8_t* dst,
+                               int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add        %1, %1, %0                     \n"
+      "1:                                        \n"
+      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 8 ARGB
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
+      "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
+      "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
+      "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
+      "uaddlp     v3.8h, v3.16b                  \n"  // A 16 bytes -> 8 shorts.
+      "ld4        {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8
+      "uadalp     v0.8h, v16.16b                 \n"  // B 16 bytes -> 8 shorts.
+      "uadalp     v1.8h, v17.16b                 \n"  // G 16 bytes -> 8 shorts.
+      "uadalp     v2.8h, v18.16b                 \n"  // R 16 bytes -> 8 shorts.
+      "uadalp     v3.8h, v19.16b                 \n"  // A 16 bytes -> 8 shorts.
+      "rshrn      v0.8b, v0.8h, #2               \n"  // round and pack
+      "rshrn      v1.8b, v1.8h, #2               \n"
+      "rshrn      v2.8b, v2.8h, #2               \n"
+      "rshrn      v3.8b, v3.8h, #2               \n"
+      "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32     \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
 }
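Per channel, this box filter is a rounded 2x2 average; the +2 below
rounds the same way rshrn #2 does. Scalar sketch for one output byte:

static inline uint8_t Box2x2(uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br) {
  return (uint8_t)((tl + tr + bl + br + 2) >> 2);
}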
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-void ScaleARGBRowDownEven_NEON(const uint8* src_argb,  ptrdiff_t src_stride,
-                               int src_stepx, uint8* dst_argb, int dst_width) {
-  asm volatile (
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[0], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[1], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[2], [%0], %3            \n"
-    MEMACCESS(0)
-    "ld1        {v0.s}[3], [%0], %3            \n"
-    "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
-    MEMACCESS(1)
-    "st1        {v0.16b}, [%1], #16            \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(dst_argb),    // %1
-    "+r"(dst_width)    // %2
-  : "r"((int64)(src_stepx * 4)) // %3
-  : "memory", "cc", "v0"
-  );
+void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
+                               ptrdiff_t src_stride,
+                               int src_stepx,
+                               uint8_t* dst_argb,
+                               int dst_width) {
+  (void)src_stride;
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v0.s}[0], [%0], %3            \n"
+      "ld1        {v0.s}[1], [%0], %3            \n"
+      "ld1        {v0.s}[2], [%0], %3            \n"
+      "ld1        {v0.s}[3], [%0], %3            \n"
+      "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
+      "st1        {v0.16b}, [%1], #16            \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),                // %0
+        "+r"(dst_argb),                // %1
+        "+r"(dst_width)                // %2
+      : "r"((int64_t)(src_stepx * 4))  // %3
+      : "memory", "cc", "v0");
 }
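This is a plain gather: %3 holds src_stepx * 4 bytes, so the loop is
equivalent to the following sketch (one uint32_t per ARGB pixel):

static void ARGBRowDownEven_sketch(const uint32_t* src, int src_stepx,
                                   uint32_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}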
 
 // Reads 4 pixels at a time.
@@ -867,96 +818,88 @@
 // Alignment requirement: src_argb 4 byte aligned.
 // TODO(Yang Zhang): Might be worth another optimization pass in future.
 // It could be upgraded to 8 pixels at a time to start with.
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
+                                  ptrdiff_t src_stride,
                                   int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
-  asm volatile (
-    "add        %1, %1, %0                     \n"
-  "1:                                          \n"
-    MEMACCESS(0)
-    "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
-    MEMACCESS(1)
-    "ld1        {v1.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v2.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v3.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v4.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v5.8b}, [%1], %4              \n"
-    MEMACCESS(0)
-    "ld1        {v6.8b}, [%0], %4              \n"
-    MEMACCESS(1)
-    "ld1        {v7.8b}, [%1], %4              \n"
-    "uaddl      v0.8h, v0.8b, v1.8b            \n"
-    "uaddl      v2.8h, v2.8b, v3.8b            \n"
-    "uaddl      v4.8h, v4.8b, v5.8b            \n"
-    "uaddl      v6.8h, v6.8b, v7.8b            \n"
-    "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
-    "mov        v0.d[1], v2.d[0]               \n"
-    "mov        v2.d[0], v16.d[1]              \n"
-    "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
-    "mov        v4.d[1], v6.d[0]               \n"
-    "mov        v6.d[0], v16.d[1]              \n"
-    "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
-    "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
-    "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
-    "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
-    "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
-    MEMACCESS(2)
-    "st1     {v0.16b}, [%2], #16               \n"
-    "b.gt       1b                             \n"
-  : "+r"(src_argb),    // %0
-    "+r"(src_stride),  // %1
-    "+r"(dst_argb),    // %2
-    "+r"(dst_width)    // %3
-  : "r"((int64)(src_stepx * 4)) // %4
-  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
-  );
+                                  uint8_t* dst_argb,
+                                  int dst_width) {
+  asm volatile(
+      "add        %1, %1, %0                     \n"
+      "1:                                        \n"
+      "ld1        {v0.8b}, [%0], %4              \n"  // Read 4 2x2 -> 2x1
+      "ld1        {v1.8b}, [%1], %4              \n"
+      "ld1        {v2.8b}, [%0], %4              \n"
+      "ld1        {v3.8b}, [%1], %4              \n"
+      "ld1        {v4.8b}, [%0], %4              \n"
+      "ld1        {v5.8b}, [%1], %4              \n"
+      "ld1        {v6.8b}, [%0], %4              \n"
+      "ld1        {v7.8b}, [%1], %4              \n"
+      "uaddl      v0.8h, v0.8b, v1.8b            \n"
+      "uaddl      v2.8h, v2.8b, v3.8b            \n"
+      "uaddl      v4.8h, v4.8b, v5.8b            \n"
+      "uaddl      v6.8h, v6.8b, v7.8b            \n"
+      "mov        v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
+      "mov        v0.d[1], v2.d[0]               \n"
+      "mov        v2.d[0], v16.d[1]              \n"
+      "mov        v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
+      "mov        v4.d[1], v6.d[0]               \n"
+      "mov        v6.d[0], v16.d[1]              \n"
+      "add        v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
+      "add        v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
+      "rshrn      v0.8b, v0.8h, #2               \n"  // first 2 pixels.
+      "rshrn2     v0.16b, v4.8h, #2              \n"  // next 2 pixels.
+      "subs       %w3, %w3, #4                   \n"  // 4 pixels per loop.
+      "st1     {v0.16b}, [%2], #16               \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_argb),                // %0
+        "+r"(src_stride),              // %1
+        "+r"(dst_argb),                // %2
+        "+r"(dst_width)                // %3
+      : "r"((int64_t)(src_stepx * 4))  // %4
+      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
 }
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD1_DATA32_LANE(vn, n)                               \
-    "lsr        %5, %3, #16                    \n"             \
-    "add        %6, %1, %5, lsl #2             \n"             \
-    "add        %3, %3, %4                     \n"             \
-    MEMACCESS(6)                                               \
-    "ld1        {"#vn".s}["#n"], [%6]          \n"
+#define LOAD1_DATA32_LANE(vn, n)                 \
+  "lsr        %5, %3, #16                    \n" \
+  "add        %6, %1, %5, lsl #2             \n" \
+  "add        %3, %3, %4                     \n" \
+  "ld1        {" #vn ".s}[" #n "], [%6]      \n"
 
-void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
-  const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
-  int64 tmp64;
-  asm volatile (
-  "1:                                          \n"
-    LOAD1_DATA32_LANE(v0, 0)
-    LOAD1_DATA32_LANE(v0, 1)
-    LOAD1_DATA32_LANE(v0, 2)
-    LOAD1_DATA32_LANE(v0, 3)
-    LOAD1_DATA32_LANE(v1, 0)
-    LOAD1_DATA32_LANE(v1, 1)
-    LOAD1_DATA32_LANE(v1, 2)
-    LOAD1_DATA32_LANE(v1, 3)
-
-    MEMACCESS(0)
-    "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
-    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
-    "b.gt        1b                            \n"
-  : "+r"(dst_argb),     // %0
-    "+r"(src_argb),     // %1
-    "+r"(dst_width64),  // %2
-    "+r"(x64),          // %3
-    "+r"(dx64),         // %4
-    "=&r"(tmp64),       // %5
-    "+r"(src_tmp)       // %6
-  :
-  : "memory", "cc", "v0", "v1"
-  );
+void ScaleARGBCols_NEON(uint8_t* dst_argb,
+                        const uint8_t* src_argb,
+                        int dst_width,
+                        int x,
+                        int dx) {
+  const uint8_t* src_tmp = src_argb;
+  int64_t x64 = (int64_t)x;    // NOLINT
+  int64_t dx64 = (int64_t)dx;  // NOLINT
+  int64_t tmp64;
+  asm volatile(
+      "1:                                        \n"
+      // clang-format off
+      LOAD1_DATA32_LANE(v0, 0)
+      LOAD1_DATA32_LANE(v0, 1)
+      LOAD1_DATA32_LANE(v0, 2)
+      LOAD1_DATA32_LANE(v0, 3)
+      LOAD1_DATA32_LANE(v1, 0)
+      LOAD1_DATA32_LANE(v1, 1)
+      LOAD1_DATA32_LANE(v1, 2)
+      LOAD1_DATA32_LANE(v1, 3)
+      // clang-format on
+      "st1        {v0.4s, v1.4s}, [%0], #32      \n"  // store pixels
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "b.gt       1b                             \n"
+      : "+r"(dst_argb),   // %0
+        "+r"(src_argb),   // %1
+        "+r"(dst_width),  // %2
+        "+r"(x64),        // %3
+        "+r"(dx64),       // %4
+        "=&r"(tmp64),     // %5
+        "+r"(src_tmp)     // %6
+      :
+      : "memory", "cc", "v0", "v1");
 }
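LOAD1_DATA32_LANE walks x in 16.16 fixed point: x >> 16 is the source
pixel index and dx the per-output step. The whole loop is equivalent to
this sketch:

static void ARGBCols_sketch(uint32_t* dst, const uint32_t* src,
                            int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  // integer part of the 16.16 coordinate
    x += dx;
  }
}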
 
 #undef LOAD1_DATA32_LANE
@@ -963,21 +906,22 @@
 
 // TODO(Yang Zhang): Investigate less load instructions for
 // the x/dx stepping
-#define LOAD2_DATA32_LANE(vn1, vn2, n)                         \
-    "lsr        %5, %3, #16                           \n"      \
-    "add        %6, %1, %5, lsl #2                    \n"      \
-    "add        %3, %3, %4                            \n"      \
-    MEMACCESS(6)                                               \
-    "ld2        {"#vn1".s, "#vn2".s}["#n"], [%6]      \n"
+#define LOAD2_DATA32_LANE(vn1, vn2, n)                  \
+  "lsr        %5, %3, #16                           \n" \
+  "add        %6, %1, %5, lsl #2                    \n" \
+  "add        %3, %3, %4                            \n" \
+  "ld2        {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6]  \n"
 
-void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
-                              int dst_width, int x, int dx) {
+void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
+                              const uint8_t* src_argb,
+                              int dst_width,
+                              int x,
+                              int dx) {
   int dx_offset[4] = {0, 1, 2, 3};
   int* tmp = dx_offset;
-  const uint8* src_tmp = src_argb;
-  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
-  int64 x64 = (int64) x;
-  int64 dx64 = (int64) dx;
+  const uint8_t* src_tmp = src_argb;
+  int64_t x64 = (int64_t)x;    // NOLINT
+  int64_t dx64 = (int64_t)dx;  // NOLINT
   asm volatile (
     "dup        v0.4s, %w3                     \n"  // x
     "dup        v1.4s, %w4                     \n"  // dx
@@ -1014,7 +958,6 @@
     "shrn       v0.8b, v16.8h, #7              \n"
     "shrn2      v0.16b, v17.8h, #7             \n"
 
-    MEMACCESS(0)
     "st1     {v0.4s}, [%0], #16                \n"  // store pixels
     "add     v5.4s, v5.4s, v6.4s               \n"
     "subs    %w2, %w2, #4                      \n"  // 4 processed per loop
@@ -1021,7 +964,7 @@
     "b.gt    1b                                \n"
   : "+r"(dst_argb),         // %0
     "+r"(src_argb),         // %1
-    "+r"(dst_width64),      // %2
+    "+r"(dst_width),        // %2
     "+r"(x64),              // %3
     "+r"(dx64),             // %4
     "+r"(tmp),              // %5
@@ -1033,6 +976,85 @@
 }
 
 #undef LOAD2_DATA32_LANE
+
+// Read 16x2 average down and write 8x1.
+void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
+                              ptrdiff_t src_stride,
+                              uint16_t* dst,
+                              int dst_width) {
+  asm volatile(
+      // change the stride to row 2 pointer
+      "add        %1, %0, %1, lsl #1             \n"  // ptr + stide * 2
+      "1:                                        \n"
+      "ld1        {v0.8h, v1.8h}, [%0], #32      \n"  // load row 1 and post inc
+      "ld1        {v2.8h, v3.8h}, [%1], #32      \n"  // load row 2 and post inc
+      "subs       %w3, %w3, #8                   \n"  // 8 processed per loop
+      "uaddlp     v0.4s, v0.8h                   \n"  // row 1 add adjacent
+      "uaddlp     v1.4s, v1.8h                   \n"
+      "uadalp     v0.4s, v2.8h                   \n"  // +row 2 add adjacent
+      "uadalp     v1.4s, v3.8h                   \n"
+      "rshrn      v0.4h, v0.4s, #2               \n"  // round and pack
+      "rshrn2     v0.8h, v1.4s, #2               \n"
+      "st1        {v0.8h}, [%2], #16             \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      :
+      : "v0", "v1", "v2", "v3"  // Clobber List
+      );
+}
+
+// Read 8x2 upsample with filtering and write 16x1.
+// Actually reads an extra pixel, so 9x2.
+void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
+                         ptrdiff_t src_stride,
+                         uint16_t* dst,
+                         int dst_width) {
+  asm volatile(
+      "add        %1, %0, %1, lsl #1             \n"  // ptr + stide * 2
+      "movi       v0.8h, #9                      \n"  // constants
+      "movi       v1.4s, #3                      \n"
+
+      "1:                                        \n"
+      "ld1        {v3.8h}, [%0], %4              \n"  // TL read first 8
+      "ld1        {v4.8h}, [%0], %5              \n"  // TR read 8 offset by 1
+      "ld1        {v5.8h}, [%1], %4              \n"  // BL read 8 from next row
+      "ld1        {v6.8h}, [%1], %5              \n"  // BR offset by 1
+      "subs       %w3, %w3, #16                  \n"  // 16 dst pixels per loop
+      "umull      v16.4s, v3.4h, v0.4h           \n"
+      "umull2     v7.4s, v3.8h, v0.8h            \n"
+      "umull      v18.4s, v4.4h, v0.4h           \n"
+      "umull2     v17.4s, v4.8h, v0.8h           \n"
+      "uaddw      v16.4s, v16.4s, v6.4h          \n"
+      "uaddl2     v19.4s, v6.8h, v3.8h           \n"
+      "uaddl      v3.4s, v6.4h, v3.4h            \n"
+      "uaddw2     v6.4s, v7.4s, v6.8h            \n"
+      "uaddl2     v7.4s, v5.8h, v4.8h            \n"
+      "uaddl      v4.4s, v5.4h, v4.4h            \n"
+      "uaddw      v18.4s, v18.4s, v5.4h          \n"
+      "mla        v16.4s, v4.4s, v1.4s           \n"
+      "mla        v18.4s, v3.4s, v1.4s           \n"
+      "mla        v6.4s, v7.4s, v1.4s            \n"
+      "uaddw2     v4.4s, v17.4s, v5.8h           \n"
+      "uqrshrn    v16.4h,  v16.4s, #4            \n"
+      "mla        v4.4s, v19.4s, v1.4s           \n"
+      "uqrshrn2   v16.8h, v6.4s, #4              \n"
+      "uqrshrn    v17.4h, v18.4s, #4             \n"
+      "uqrshrn2   v17.8h, v4.4s, #4              \n"
+      "st2        {v16.8h-v17.8h}, [%2], #32     \n"
+      "b.gt       1b                             \n"
+      : "+r"(src_ptr),     // %0
+        "+r"(src_stride),  // %1
+        "+r"(dst),         // %2
+        "+r"(dst_width)    // %3
+      : "r"(2LL),          // %4
+        "r"(14LL)          // %5
+      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
+        "v19"  // Clobber List
+      );
+}
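The constants loaded up front (9 and 3) are the 2x bilinear weights:
each output sample is a 9:3:3:1 blend of its 2x2 source neighborhood,
and uqrshrn #4 is the rounded, saturating divide by 16. Scalar sketch
for one sample, with p00 the nearest source pixel, p01/p10 its
horizontal/vertical neighbors and p11 the diagonal one:

static inline uint16_t Up2Bilinear_sketch(uint16_t p00, uint16_t p01,
                                          uint16_t p10, uint16_t p11) {
  return (uint16_t)((9 * p00 + 3 * p01 + 3 * p10 + p11 + 8) >> 4);
}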
 
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
--- a/third_party/libyuv/source/scale_win.cc
+++ b/third_party/libyuv/source/scale_win.cc
@@ -17,97 +17,93 @@
 #endif
 
 // This module is for 32 bit Visual C x86 and clangcl
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
 // Offsets for source bytes 0 to 9
-static uvec8 kShuf0 =
-  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
-static uvec8 kShuf1 =
-  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 =
-  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
+                             128, 128, 128, 128, 128, 128, 128, 128};
 
 // Offsets for source bytes 0 to 10
-static uvec8 kShuf01 =
-  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
+static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};
 
 // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
-static uvec8 kShuf11 =
-  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
+static const uvec8 kShuf11 = {2, 3, 4, 5,  5,  6,  6,  7,
+                              8, 9, 9, 10, 10, 11, 12, 13};
 
 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf21 =
-  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
+static const uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
+                              10, 11, 12, 13, 13, 14, 14, 15};
 
 // Coefficients for source bytes 0 to 10
-static uvec8 kMadd01 =
-  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
+static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};
 
 // Coefficients for source bytes 10 to 21
-static uvec8 kMadd11 =
-  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
+static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};
 
 // Coefficients for source bytes 21 to 31
-static uvec8 kMadd21 =
-  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
+static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};
 
 // Coefficients for source bytes 21 to 31
-static vec16 kRound34 =
-  { 2, 2, 2, 2, 2, 2, 2, 2 };
+static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};
 
-static uvec8 kShuf38a =
-  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
+                               128, 128, 128, 128, 128, 128, 128, 128};
 
-static uvec8 kShuf38b =
-  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
+static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
+                               6,   8,   11,  14,  128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 0,1,2
-static uvec8 kShufAc =
-  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
+                              128, 128, 128, 128, 128, 128, 128, 128};
 
 // Arrange words 0,3,6 into 3,4,5
-static uvec8 kShufAc3 =
-  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
+static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
+                               6,   7,   12,  13,  128, 128, 128, 128};
 
 // Scaling values for boxes of 3x3 and 2x3
-static uvec16 kScaleAc33 =
-  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
+static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
+                                  65536 / 9, 65536 / 6, 0,         0};
 
 // Arrange first value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb0 =
-  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
+                               11, 128, 14, 128, 128, 128, 128, 128};
 
 // Arrange second value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb1 =
-  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
+                               12, 128, 15, 128, 128, 128, 128, 128};
 
 // Arrange third value for pixels 0,1,2,3,4,5
-static uvec8 kShufAb2 =
-  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
+static const uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
+                               13, 128, 128, 128, 128, 128, 128, 128};
 
 // Scaling values for boxes of 3x2 and 2x2
-static uvec16 kScaleAb2 =
-  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
+static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
+                                 65536 / 3, 65536 / 2, 0,         0};
 
 // Reads 32 pixels, throws half away and writes 16 pixels.
-__declspec(naked)
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                         uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           uint8_t* dst_ptr,
+                                           int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    psrlw      xmm0, 8               // isolate odd pixels.
+    psrlw      xmm0, 8          // isolate odd pixels.
     psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -120,27 +116,28 @@
 }
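Here psrlw xmm, 8 keeps the high byte of each 16-bit lane (the
odd-index source pixels) and packuswb repacks them to bytes, so each
iteration performs dst[i] = src[2 * i + 1] for 16 output pixels.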
 
 // Blends 32x1 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                               uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t* dst_ptr,
+                                                 int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
 
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
     psrlw      xmm4, 15
     packuswb   xmm4, xmm4
-    pxor       xmm5, xmm5            // constant 0
+    pxor       xmm5, xmm5  // constant 0
 
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm0, xmm4  // horizontal add
     pmaddubsw  xmm1, xmm4
-    pavgw      xmm0, xmm5      // (x + 1) / 2
+    pavgw      xmm0, xmm5       // (x + 1) / 2
     pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -153,20 +150,21 @@
 }
 
 // Blends 32x2 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                            uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
+                                              ptrdiff_t src_stride,
+                                              uint8_t* dst_ptr,
+                                              int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
 
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
     psrlw      xmm4, 15
     packuswb   xmm4, xmm4
-    pxor       xmm5, xmm5            // constant 0
+    pxor       xmm5, xmm5  // constant 0
 
   wloop:
     movdqu     xmm0, [eax]
@@ -174,15 +172,15 @@
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
-    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm0, xmm4  // horizontal add
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // vertical add
+    paddw      xmm0, xmm2  // vertical add
     paddw      xmm1, xmm3
     psrlw      xmm0, 1
     psrlw      xmm1, 1
-    pavgw      xmm0, xmm5      // (x + 1) / 2
+    pavgw      xmm0, xmm5  // (x + 1) / 2
     pavgw      xmm1, xmm5
     packuswb   xmm0, xmm1
     movdqu     [edx], xmm0
@@ -197,23 +195,24 @@
 
 #ifdef HAS_SCALEROWDOWN2_AVX2
 // Reads 64 pixels, throws half away and writes 32 pixels.
-__declspec(naked)
-void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
+                                          ptrdiff_t src_stride,
+                                          uint8_t* dst_ptr,
+                                          int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     vmovdqu     ymm0, [eax]
     vmovdqu     ymm1, [eax + 32]
     lea         eax,  [eax + 64]
-    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
+    vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
     vpsrlw      ymm1, ymm1, 8
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -225,30 +224,31 @@
 }
 
 // Blends 64x1 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                              uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
+                                                ptrdiff_t src_stride,
+                                                uint8_t* dst_ptr,
+                                                int dst_width) {
   __asm {
-    mov         eax, [esp + 4]        // src_ptr
-                                      // src_stride
-    mov         edx, [esp + 12]       // dst_ptr
-    mov         ecx, [esp + 16]       // dst_width
+    mov         eax, [esp + 4]  // src_ptr
+    // src_stride
+    mov         edx, [esp + 12]  // dst_ptr
+    mov         ecx, [esp + 16]  // dst_width
 
-    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
     vpsrlw      ymm4, ymm4, 15
     vpackuswb   ymm4, ymm4, ymm4
-    vpxor       ymm5, ymm5, ymm5      // constant 0
+    vpxor       ymm5, ymm5, ymm5  // constant 0
 
   wloop:
     vmovdqu     ymm0, [eax]
     vmovdqu     ymm1, [eax + 32]
     lea         eax,  [eax + 64]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
     vpmaddubsw  ymm1, ymm1, ymm4
-    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
     vpavgw      ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -262,20 +262,21 @@
 // For rounding, average = (sum + 2) / 4
 // becomes average((sum >> 1), 0)
 // Blends 64x2 rectangle to 32x1.
-__declspec(naked)
-void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
+                                             ptrdiff_t src_stride,
+                                             uint8_t* dst_ptr,
+                                             int dst_width) {
   __asm {
     push        esi
-    mov         eax, [esp + 4 + 4]    // src_ptr
-    mov         esi, [esp + 4 + 8]    // src_stride
-    mov         edx, [esp + 4 + 12]   // dst_ptr
-    mov         ecx, [esp + 4 + 16]   // dst_width
+    mov         eax, [esp + 4 + 4]  // src_ptr
+    mov         esi, [esp + 4 + 8]  // src_stride
+    mov         edx, [esp + 4 + 12]  // dst_ptr
+    mov         ecx, [esp + 4 + 16]  // dst_width
 
-    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
+    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
     vpsrlw      ymm4, ymm4, 15
     vpackuswb   ymm4, ymm4, ymm4
-    vpxor       ymm5, ymm5, ymm5      // constant 0
+    vpxor       ymm5, ymm5, ymm5  // constant 0
 
   wloop:
     vmovdqu     ymm0, [eax]
@@ -283,18 +284,18 @@
     vmovdqu     ymm2, [eax + esi]
     vmovdqu     ymm3, [eax + esi + 32]
     lea         eax,  [eax + 64]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
     vpmaddubsw  ymm1, ymm1, ymm4
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // vertical add
+    vpaddw      ymm0, ymm0, ymm2  // vertical add
     vpaddw      ymm1, ymm1, ymm3
-    vpsrlw      ymm0, ymm0, 1         // (x + 2) / 4 = (x / 2 + 1) / 2
+    vpsrlw      ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
     vpsrlw      ymm1, ymm1, 1
-    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
+    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
     vpavgw      ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vmovdqu     [edx], ymm0
     lea         edx, [edx + 32]
     sub         ecx, 32
@@ -308,15 +309,16 @@
 #endif  // HAS_SCALEROWDOWN2_AVX2
 
 // Point samples 32 pixels to 8 pixels.
-__declspec(naked)
-void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
+                                           ptrdiff_t src_stride,
+                                           uint8_t* dst_ptr,
+                                           int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
-    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
+    pcmpeqb    xmm5, xmm5       // generate mask 0x00ff0000
     psrld      xmm5, 24
     pslld      xmm5, 16
 
@@ -339,39 +341,40 @@
 }
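
The 0x00ff0000 mask built above keeps byte 2 of every 4-byte group, so the 4:1 point sampler emits the third pixel of each quad. A scalar sketch, with the index inferred from the mask:

    #include <stdint.h>

    static void ScaleRowDown4_sketch(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                     int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        dst_ptr[x] = src_ptr[4 * x + 2];  // the byte kept by mask 0x00ff0000
      }
    }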
 
 // Blends 32x4 rectangle to 8x1.
-__declspec(naked)
-void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
+                                              ptrdiff_t src_stride,
+                                              uint8_t* dst_ptr,
+                                              int dst_width) {
   __asm {
     push       esi
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_ptr
-    mov        esi, [esp + 8 + 8]    // src_stride
-    mov        edx, [esp + 8 + 12]   // dst_ptr
-    mov        ecx, [esp + 8 + 16]   // dst_width
+    mov        eax, [esp + 8 + 4]  // src_ptr
+    mov        esi, [esp + 8 + 8]  // src_stride
+    mov        edx, [esp + 8 + 12]  // dst_ptr
+    mov        ecx, [esp + 8 + 16]  // dst_width
     lea        edi, [esi + esi * 2]  // src_stride * 3
-    pcmpeqb    xmm4, xmm4            // constant 0x0101
+    pcmpeqb    xmm4, xmm4  // constant 0x0101
     psrlw      xmm4, 15
     movdqa     xmm5, xmm4
     packuswb   xmm4, xmm4
-    psllw      xmm5, 3               // constant 0x0008
+    psllw      xmm5, 3  // constant 0x0008
 
   wloop:
-    movdqu     xmm0, [eax]           // average rows
+    movdqu     xmm0, [eax]  // average rows
     movdqu     xmm1, [eax + 16]
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
-    pmaddubsw  xmm0, xmm4      // horizontal add
+    pmaddubsw  xmm0, xmm4  // horizontal add
     pmaddubsw  xmm1, xmm4
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // vertical add rows 0, 1
+    paddw      xmm0, xmm2  // vertical add rows 0, 1
     paddw      xmm1, xmm3
     movdqu     xmm2, [eax + esi * 2]
     movdqu     xmm3, [eax + esi * 2 + 16]
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // add row 2
+    paddw      xmm0, xmm2  // add row 2
     paddw      xmm1, xmm3
     movdqu     xmm2, [eax + edi]
     movdqu     xmm3, [eax + edi + 16]
@@ -378,11 +381,11 @@
     lea        eax, [eax + 32]
     pmaddubsw  xmm2, xmm4
     pmaddubsw  xmm3, xmm4
-    paddw      xmm0, xmm2      // add row 3
+    paddw      xmm0, xmm2  // add row 3
     paddw      xmm1, xmm3
     phaddw     xmm0, xmm1
-    paddw      xmm0, xmm5      // + 8 for round
-    psrlw      xmm0, 4         // /16 for average of 4 * 4
+    paddw      xmm0, xmm5  // + 8 for round
+    psrlw      xmm0, 4  // /16 for average of 4 * 4
     packuswb   xmm0, xmm0
     movq       qword ptr [edx], xmm0
     lea        edx, [edx + 8]
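
Arithmetically, the 4x4 box above sums 16 source pixels per output, adds 8 to round ("+ 8 for round"), and shifts right by 4 ("/16"). A scalar model for one output pixel:

    #include <stddef.h>
    #include <stdint.h>

    // Average of a 4x4 block with round-to-nearest: (sum + 8) / 16.
    static uint8_t Box4x4_sketch(const uint8_t* src, ptrdiff_t stride, int x) {
      int sum = 0;
      for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
          sum += src[r * stride + 4 * x + c];
      return (uint8_t)((sum + 8) >> 4);
    }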
@@ -397,15 +400,16 @@
 
 #ifdef HAS_SCALEROWDOWN4_AVX2
 // Point samples 64 pixels to 16 pixels.
-__declspec(naked)
-void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                        uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
+                                          ptrdiff_t src_stride,
+                                          uint8_t* dst_ptr,
+                                          int dst_width) {
   __asm {
-    mov         eax, [esp + 4]        // src_ptr
-                                      // src_stride ignored
-    mov         edx, [esp + 12]       // dst_ptr
-    mov         ecx, [esp + 16]       // dst_width
-    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
+    mov         eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov         edx, [esp + 12]  // dst_ptr
+    mov         ecx, [esp + 16]  // dst_width
+    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0x00ff0000
     vpsrld      ymm5, ymm5, 24
     vpslld      ymm5, ymm5, 16
 
@@ -416,10 +420,10 @@
     vpand       ymm0, ymm0, ymm5
     vpand       ymm1, ymm1, ymm5
     vpackuswb   ymm0, ymm0, ymm1
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vpsrlw      ymm0, ymm0, 8
     vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8       // unmutate vpackuswb
     vmovdqu     [edx], xmm0
     lea         edx, [edx + 16]
     sub         ecx, 16
@@ -431,38 +435,39 @@
 }
 
 // Blends 64x4 rectangle to 16x1.
-__declspec(naked)
-void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
-                           uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
+                                             ptrdiff_t src_stride,
+                                             uint8_t* dst_ptr,
+                                             int dst_width) {
   __asm {
     push        esi
     push        edi
-    mov         eax, [esp + 8 + 4]    // src_ptr
-    mov         esi, [esp + 8 + 8]    // src_stride
-    mov         edx, [esp + 8 + 12]   // dst_ptr
-    mov         ecx, [esp + 8 + 16]   // dst_width
+    mov         eax, [esp + 8 + 4]  // src_ptr
+    mov         esi, [esp + 8 + 8]  // src_stride
+    mov         edx, [esp + 8 + 12]  // dst_ptr
+    mov         ecx, [esp + 8 + 16]  // dst_width
     lea         edi, [esi + esi * 2]  // src_stride * 3
-    vpcmpeqb    ymm4, ymm4, ymm4            // constant 0x0101
+    vpcmpeqb    ymm4, ymm4, ymm4  // constant 0x0101
     vpsrlw      ymm4, ymm4, 15
-    vpsllw      ymm5, ymm4, 3               // constant 0x0008
+    vpsllw      ymm5, ymm4, 3  // constant 0x0008
     vpackuswb   ymm4, ymm4, ymm4
 
   wloop:
-    vmovdqu     ymm0, [eax]           // average rows
+    vmovdqu     ymm0, [eax]  // average rows
     vmovdqu     ymm1, [eax + 32]
     vmovdqu     ymm2, [eax + esi]
     vmovdqu     ymm3, [eax + esi + 32]
-    vpmaddubsw  ymm0, ymm0, ymm4      // horizontal add
+    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
     vpmaddubsw  ymm1, ymm1, ymm4
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // vertical add rows 0, 1
+    vpaddw      ymm0, ymm0, ymm2  // vertical add rows 0, 1
     vpaddw      ymm1, ymm1, ymm3
     vmovdqu     ymm2, [eax + esi * 2]
     vmovdqu     ymm3, [eax + esi * 2 + 32]
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // add row 2
+    vpaddw      ymm0, ymm0, ymm2  // add row 2
     vpaddw      ymm1, ymm1, ymm3
     vmovdqu     ymm2, [eax + edi]
     vmovdqu     ymm3, [eax + edi + 32]
@@ -469,14 +474,14 @@
     lea         eax,  [eax + 64]
     vpmaddubsw  ymm2, ymm2, ymm4
     vpmaddubsw  ymm3, ymm3, ymm4
-    vpaddw      ymm0, ymm0, ymm2      // add row 3
+    vpaddw      ymm0, ymm0, ymm2  // add row 3
     vpaddw      ymm1, ymm1, ymm3
-    vphaddw     ymm0, ymm0, ymm1      // mutates
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vphaddw
-    vpaddw      ymm0, ymm0, ymm5      // + 8 for round
-    vpsrlw      ymm0, ymm0, 4         // /32 for average of 4 * 4
+    vphaddw     ymm0, ymm0, ymm1  // mutates
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vphaddw
+    vpaddw      ymm0, ymm0, ymm5  // + 8 for round
+    vpsrlw      ymm0, ymm0, 4  // /16 for average of 4 * 4
     vpackuswb   ymm0, ymm0, ymm0
-    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
+    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
     vmovdqu     [edx], xmm0
     lea         edx, [edx + 16]
     sub         ecx, 16
@@ -494,14 +499,15 @@
 // Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
 // Then shuffled to do the scaling.
 
-__declspec(naked)
-void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            uint8_t* dst_ptr,
+                                            int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]   // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
     movdqa     xmm3, xmmword ptr kShuf0
     movdqa     xmm4, xmmword ptr kShuf1
     movdqa     xmm5, xmmword ptr kShuf2
@@ -541,16 +547,16 @@
 // xmm7 kRound34
 
 // Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShuf01
     movdqa     xmm3, xmmword ptr kShuf11
     movdqa     xmm4, xmmword ptr kShuf21
@@ -559,7 +565,7 @@
     movdqa     xmm7, xmmword ptr kRound34
 
   wloop:
-    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm0, [eax]  // pixels 0..7
     movdqu     xmm1, [eax + esi]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm2
@@ -568,7 +574,7 @@
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm0, [eax + 8]  // pixels 8..15
     movdqu     xmm1, [eax + esi + 8]
     pavgb      xmm0, xmm1
     pshufb     xmm0, xmm3
@@ -577,7 +583,7 @@
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm0, [eax + 16]  // pixels 16..23
     movdqu     xmm1, [eax + esi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm0, xmm1
@@ -598,16 +604,16 @@
 }
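
Between the shuffles, each 3/4 box kernel is a two-tap weighted average in quarter units: pmaddubsw against coefficient tables (kMadd*, loaded in lines elided from this hunk), paddsw kRound34, then psrlw by 2. The shape of that computation, with the weights left symbolic:

    #include <stdint.h>

    // Two-tap filter in 1/4 units, rounded; w0 + w1 == 4. The real weights
    // come from the kMadd* tables and vary per output position.
    static uint8_t Weighted34_sketch(uint8_t p0, uint8_t p1, int w0, int w1) {
      return (uint8_t)((w0 * p0 + w1 * p1 + 2) >> 2);
    }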
 
 // Note that movdqa+palign may be better than movdqu.
-__declspec(naked)
-void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShuf01
     movdqa     xmm3, xmmword ptr kShuf11
     movdqa     xmm4, xmmword ptr kShuf21
@@ -616,7 +622,7 @@
     movdqa     xmm7, xmmword ptr kRound34
 
   wloop:
-    movdqu     xmm0, [eax]           // pixels 0..7
+    movdqu     xmm0, [eax]  // pixels 0..7
     movdqu     xmm1, [eax + esi]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
@@ -626,7 +632,7 @@
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx], xmm0
-    movdqu     xmm0, [eax + 8]       // pixels 8..15
+    movdqu     xmm0, [eax + 8]  // pixels 8..15
     movdqu     xmm1, [eax + esi + 8]
     pavgb      xmm1, xmm0
     pavgb      xmm0, xmm1
@@ -636,7 +642,7 @@
     psrlw      xmm0, 2
     packuswb   xmm0, xmm0
     movq       qword ptr [edx + 8], xmm0
-    movdqu     xmm0, [eax + 16]      // pixels 16..23
+    movdqu     xmm0, [eax + 16]  // pixels 16..23
     movdqu     xmm1, [eax + esi + 16]
     lea        eax, [eax + 32]
     pavgb      xmm1, xmm0
@@ -660,26 +666,27 @@
 // 3/8 point sampler
 
 // Scale 32 pixels to 12
-__declspec(naked)
-void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
-                          uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
+                                            ptrdiff_t src_stride,
+                                            uint8_t* dst_ptr,
+                                            int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_ptr
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_ptr
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_ptr
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_ptr
+    mov        ecx, [esp + 16]  // dst_width
     movdqa     xmm4, xmmword ptr kShuf38a
     movdqa     xmm5, xmmword ptr kShuf38b
 
   xloop:
-    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
-    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
+    movdqu     xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
+    movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
     lea        eax, [eax + 32]
     pshufb     xmm0, xmm4
     pshufb     xmm1, xmm5
     paddusb    xmm0, xmm1
 
-    movq       qword ptr [edx], xmm0  // write 12 pixels
+    movq       qword ptr [edx], xmm0       // write 12 pixels
     movhlps    xmm1, xmm0
     movd       [edx + 8], xmm1
     lea        edx, [edx + 12]
@@ -691,16 +698,16 @@
 }
 
 // Scale 16x3 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShufAc
     movdqa     xmm3, xmmword ptr kShufAc3
     movdqa     xmm4, xmmword ptr kScaleAc33
@@ -707,7 +714,7 @@
     pxor       xmm5, xmm5
 
   xloop:
-    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
+    movdqu     xmm0, [eax]  // sum up 3 rows into xmm0/1
     movdqu     xmm6, [eax + esi]
     movhlps    xmm1, xmm0
     movhlps    xmm7, xmm6
@@ -725,7 +732,7 @@
     paddusw    xmm0, xmm6
     paddusw    xmm1, xmm7
 
-    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
+    movdqa     xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
     psrldq     xmm0, 2
     paddusw    xmm6, xmm0
     psrldq     xmm0, 2
@@ -732,7 +739,7 @@
     paddusw    xmm6, xmm0
     pshufb     xmm6, xmm2
 
-    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
+    movdqa     xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
     psrldq     xmm1, 2
     paddusw    xmm7, xmm1
     psrldq     xmm1, 2
@@ -740,10 +747,10 @@
     pshufb     xmm7, xmm3
     paddusw    xmm6, xmm7
 
-    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
+    pmulhuw    xmm6, xmm4  // divide by 9,9,6, 9,9,6
     packuswb   xmm6, xmm6
 
-    movd       [edx], xmm6           // write 6 pixels
+    movd       [edx], xmm6  // write 6 pixels
     psrlq      xmm6, 16
     movd       [edx + 2], xmm6
     lea        edx, [edx + 6]
@@ -756,16 +763,16 @@
 }
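
The "divide by 9,9,6" step works because pmulhuw returns the high 16 bits of an unsigned 16x16 product, so multiplying by roughly 65536/9 approximates division by 9 (it can come out one low at the top of the range). In scalar form, with the factor assumed from the comment:

    #include <stdint.h>

    // sum / 9 via a high-half multiply: the pmulhuw/kScaleAc33 trick above.
    static uint8_t DivBy9_sketch(uint16_t sum_of_9) {
      return (uint8_t)(((uint32_t)sum_of_9 * (65536 / 9)) >> 16);
    }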
 
 // Scale 16x2 pixels to 6x1 with interpolation
-__declspec(naked)
-void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
-                                ptrdiff_t src_stride,
-                                uint8* dst_ptr, int dst_width) {
+__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
+                                                  ptrdiff_t src_stride,
+                                                  uint8_t* dst_ptr,
+                                                  int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_ptr
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_ptr
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_ptr
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_ptr
+    mov        ecx, [esp + 4 + 16]  // dst_width
     movdqa     xmm2, xmmword ptr kShufAb0
     movdqa     xmm3, xmmword ptr kShufAb1
     movdqa     xmm4, xmmword ptr kShufAb2
@@ -772,12 +779,12 @@
     movdqa     xmm5, xmmword ptr kScaleAb2
 
   xloop:
-    movdqu     xmm0, [eax]           // average 2 rows into xmm0
+    movdqu     xmm0, [eax]  // average 2 rows into xmm0
     movdqu     xmm1, [eax + esi]
     lea        eax, [eax + 16]
     pavgb      xmm0, xmm1
 
-    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
+    movdqa     xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
     pshufb     xmm1, xmm2
     movdqa     xmm6, xmm0
     pshufb     xmm6, xmm3
@@ -785,10 +792,10 @@
     pshufb     xmm0, xmm4
     paddusw    xmm1, xmm0
 
-    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
+    pmulhuw    xmm1, xmm5  // divide by 3,3,2, 3,3,2
     packuswb   xmm1, xmm1
 
-    movd       [edx], xmm1           // write 6 pixels
+    movd       [edx], xmm1  // write 6 pixels
     psrlq      xmm1, 16
     movd       [edx + 2], xmm1
     lea        edx, [edx + 6]
@@ -801,26 +808,27 @@
 }
 
 // Reads 16 bytes and accumulates to 16 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr,
+                                        uint16_t* dst_ptr,
+                                        int src_width) {
   __asm {
-    mov        eax, [esp + 4]   // src_ptr
-    mov        edx, [esp + 8]   // dst_ptr
+    mov        eax, [esp + 4]  // src_ptr
+    mov        edx, [esp + 8]  // dst_ptr
     mov        ecx, [esp + 12]  // src_width
     pxor       xmm5, xmm5
 
-  // sum rows
+        // sum rows
   xloop:
-    movdqu     xmm3, [eax]       // read 16 bytes
+    movdqu     xmm3, [eax]  // read 16 bytes
     lea        eax, [eax + 16]
-    movdqu     xmm0, [edx]       // read 16 words from destination
+    movdqu     xmm0, [edx]  // read 16 words from destination
     movdqu     xmm1, [edx + 16]
     movdqa     xmm2, xmm3
     punpcklbw  xmm2, xmm5
     punpckhbw  xmm3, xmm5
-    paddusw    xmm0, xmm2        // sum 16 words
+    paddusw    xmm0, xmm2  // sum 16 words
     paddusw    xmm1, xmm3
-    movdqu     [edx], xmm0       // write 16 words to destination
+    movdqu     [edx], xmm0  // write 16 words to destination
     movdqu     [edx + 16], xmm1
     lea        edx, [edx + 32]
     sub        ecx, 16
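
ScaleAddRow is a plain widening accumulate: punpcklbw/punpckhbw zero-extend bytes to words before the add. A scalar sketch:

    #include <stdint.h>

    // Each byte is added into a 16-bit running sum. paddusw saturates;
    // this sketch ignores overflow.
    static void ScaleAddRow_sketch(const uint8_t* src_ptr, uint16_t* dst_ptr,
                                   int src_width) {
      for (int x = 0; x < src_width; ++x) {
        dst_ptr[x] += src_ptr[x];
      }
    }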
@@ -831,24 +839,25 @@
 
 #ifdef HAS_SCALEADDROW_AVX2
 // Reads 32 bytes and accumulates to 32 shorts at a time.
-__declspec(naked)
-void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
+__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr,
+                                        uint16_t* dst_ptr,
+                                        int src_width) {
   __asm {
-    mov         eax, [esp + 4]   // src_ptr
-    mov         edx, [esp + 8]   // dst_ptr
+    mov         eax, [esp + 4]  // src_ptr
+    mov         edx, [esp + 8]  // dst_ptr
     mov         ecx, [esp + 12]  // src_width
     vpxor       ymm5, ymm5, ymm5
 
-  // sum rows
+        // sum rows
   xloop:
-    vmovdqu     ymm3, [eax]       // read 32 bytes
+    vmovdqu     ymm3, [eax]  // read 32 bytes
     lea         eax, [eax + 32]
     vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
     vpunpcklbw  ymm2, ymm3, ymm5
     vpunpckhbw  ymm3, ymm3, ymm5
-    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
+    vpaddusw    ymm0, ymm2, [edx]  // sum 16 words
     vpaddusw    ymm1, ymm3, [edx + 32]
-    vmovdqu     [edx], ymm0       // write 32 words to destination
+    vmovdqu     [edx], ymm0  // write 32 words to destination
     vmovdqu     [edx + 32], ymm1
     lea         edx, [edx + 64]
     sub         ecx, 32
@@ -862,68 +871,69 @@
 
 // Constant for making pixels signed to avoid pmaddubsw
 // saturation.
-static uvec8 kFsub80 =
-  { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
-    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
+static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
+                              0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
 
 // Constant for making pixels unsigned and adding .5 for rounding.
-static uvec16 kFadd40 =
-  { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 };
+static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
+                               0x4040, 0x4040, 0x4040, 0x4040};
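
The two constants make the biased pmaddubsw blend work out: with a 7-bit fraction f and weights (128 - f) and f (the "+1" trick in the function below makes each weight pair sum to exactly 128), the product over biased pixels is (128 - f)(p0 - 128) + f(p1 - 128) = 128*blend - 16384, which always fits a signed 16-bit lane. Adding kFadd40's 0x4040 = 16384 + 64 removes the bias and rounds before the final shift by 7. A scalar model, assuming f in [0, 128]:

    #include <stdint.h>

    static uint8_t FilterCol_sketch(uint8_t p0, uint8_t p1, int f) {
      int acc = (128 - f) * (p0 - 128) + f * (p1 - 128);  // fits in int16
      acc += 0x4040;  // 16384 undoes the -128 bias, 64 is the 0.5 round
      return (uint8_t)(acc >> 7);
    }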
 
 // Bilinear column filtering. SSSE3 version.
-__declspec(naked)
-void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
-                           int dst_width, int x, int dx) {
+__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
+                                             const uint8_t* src_ptr,
+                                             int dst_width,
+                                             int x,
+                                             int dx) {
   __asm {
     push       ebx
     push       esi
     push       edi
-    mov        edi, [esp + 12 + 4]    // dst_ptr
-    mov        esi, [esp + 12 + 8]    // src_ptr
-    mov        ecx, [esp + 12 + 12]   // dst_width
+    mov        edi, [esp + 12 + 4]  // dst_ptr
+    mov        esi, [esp + 12 + 8]  // src_ptr
+    mov        ecx, [esp + 12 + 12]  // dst_width
     movd       xmm2, [esp + 12 + 16]  // x
     movd       xmm3, [esp + 12 + 20]  // dx
-    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
+    mov        eax, 0x04040000  // shuffle to line up fractions with pixel.
     movd       xmm5, eax
-    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
     psrlw      xmm6, 9
-    pcmpeqb    xmm7, xmm7           // generate 0x0001
+    pcmpeqb    xmm7, xmm7  // generate 0x0001
     psrlw      xmm7, 15
-    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    pextrw     eax, xmm2, 1  // get x0 integer. preroll
     sub        ecx, 2
     jl         xloop29
 
-    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    movdqa     xmm0, xmm2  // x1 = x0 + dx
     paddd      xmm0, xmm3
-    punpckldq  xmm2, xmm0           // x0 x1
-    punpckldq  xmm3, xmm3           // dx dx
-    paddd      xmm3, xmm3           // dx * 2, dx * 2
-    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+    punpckldq  xmm2, xmm0  // x0 x1
+    punpckldq  xmm3, xmm3  // dx dx
+    paddd      xmm3, xmm3  // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3  // get x1 integer. preroll
 
     // 2 Pixel loop.
   xloop2:
-    movdqa     xmm1, xmm2           // x0, x1 fractions.
-    paddd      xmm2, xmm3           // x += dx
+    movdqa     xmm1, xmm2  // x0, x1 fractions.
+    paddd      xmm2, xmm3  // x += dx
     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
     movd       xmm0, ebx
-    psrlw      xmm1, 9              // 7 bit fractions.
+    psrlw      xmm1, 9  // 7 bit fractions.
     movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
     movd       xmm4, ebx
-    pshufb     xmm1, xmm5           // 0011
+    pshufb     xmm1, xmm5  // 0011
     punpcklwd  xmm0, xmm4
     psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
-    pxor       xmm1, xmm6           // 0..7f and 7f..0
-    paddusb    xmm1, xmm7           // +1 so 0..7f and 80..1
-    pmaddubsw  xmm1, xmm0           // 16 bit, 2 pixels.
-    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
+    pxor       xmm1, xmm6  // 0..7f and 7f..0
+    paddusb    xmm1, xmm7  // +1 so 0..7f and 80..1
+    pmaddubsw  xmm1, xmm0  // 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
     paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
-    psrlw      xmm1, 7              // 8.7 fixed point to low 8 bits.
-    packuswb   xmm1, xmm1           // 8 bits, 2 pixels.
+    psrlw      xmm1, 7  // 8.7 fixed point to low 8 bits.
+    packuswb   xmm1, xmm1  // 8 bits, 2 pixels.
     movd       ebx, xmm1
     mov        [edi], bx
     lea        edi, [edi + 2]
-    sub        ecx, 2               // 2 pixels
+    sub        ecx, 2  // 2 pixels
     jge        xloop2
 
  xloop29:
@@ -930,18 +940,18 @@
     add        ecx, 2 - 1
     jl         xloop99
 
-    // 1 pixel remainder
+            // 1 pixel remainder
     movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
     movd       xmm0, ebx
-    psrlw      xmm2, 9              // 7 bit fractions.
-    pshufb     xmm2, xmm5           // 0011
+    psrlw      xmm2, 9  // 7 bit fractions.
+    pshufb     xmm2, xmm5  // 0011
     psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
-    pxor       xmm2, xmm6           // 0..7f and 7f..0
-    paddusb    xmm2, xmm7           // +1 so 0..7f and 80..1
-    pmaddubsw  xmm2, xmm0           // 16 bit
+    pxor       xmm2, xmm6  // 0..7f and 7f..0
+    paddusb    xmm2, xmm7  // +1 so 0..7f and 80..1
+    pmaddubsw  xmm2, xmm0  // 16 bit
     paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
-    psrlw      xmm2, 7              // 8.7 fixed point to low 8 bits.
-    packuswb   xmm2, xmm2           // 8 bits
+    psrlw      xmm2, 7  // 8.7 fixed point to low 8 bits.
+    packuswb   xmm2, xmm2  // 8 bits
     movd       ebx, xmm2
     mov        [edi], bl
 
@@ -955,13 +965,15 @@
 }
 
 // Reads 16 pixels, duplicates them and writes 32 pixels.
-__declspec(naked)
-void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
-                       int dst_width, int x, int dx) {
+__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
+                                         const uint8_t* src_ptr,
+                                         int dst_width,
+                                         int x,
+                                         int dx) {
   __asm {
-    mov        edx, [esp + 4]    // dst_ptr
-    mov        eax, [esp + 8]    // src_ptr
-    mov        ecx, [esp + 12]   // dst_width
+    mov        edx, [esp + 4]  // dst_ptr
+    mov        eax, [esp + 8]  // src_ptr
+    mov        ecx, [esp + 12]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -980,15 +992,15 @@
 }
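
The 2x column upscale is pixel duplication; the SSE2 body interleaves each register with itself (punpcklbw/punpckhbw) to double every byte. Scalar sketch:

    #include <stdint.h>

    static void ScaleColsUp2_sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                    int dst_width) {
      for (int x = 0; x < dst_width; x += 2) {
        dst_ptr[x] = dst_ptr[x + 1] = src_ptr[x / 2];
      }
    }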
 
 // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6)
-__declspec(naked)
-void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
-                            ptrdiff_t src_stride,
-                            uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
+                                              ptrdiff_t src_stride,
+                                              uint8_t* dst_argb,
+                                              int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_argb
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_argb
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]   // src_argb
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_argb
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1005,15 +1017,15 @@
 }
 
 // Blends 8x1 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
+                                                    ptrdiff_t src_stride,
+                                                    uint8_t* dst_argb,
+                                                    int dst_width) {
   __asm {
-    mov        eax, [esp + 4]        // src_argb
-                                     // src_stride ignored
-    mov        edx, [esp + 12]       // dst_argb
-    mov        ecx, [esp + 16]       // dst_width
+    mov        eax, [esp + 4]  // src_argb
+    // src_stride ignored
+    mov        edx, [esp + 12]  // dst_argb
+    mov        ecx, [esp + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1020,8 +1032,8 @@
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
     movdqa     xmm2, xmm0
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd       // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1033,16 +1045,16 @@
 }
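
For the ARGB paths a pixel is one 32-bit lane, so shufps with 0x88 collects the even pixels of a register pair and 0xdd the odd ones; pavgb then blends them. Chaining rounded averages is cheap but can land one off from the exact (a + b + c + d + 2) / 4:

    #include <stdint.h>

    // Per-channel 2x2 average as chained pavgb-style rounded halves.
    static uint8_t Avg4_sketch(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
      uint8_t ab = (uint8_t)((a + b + 1) >> 1);
      uint8_t cd = (uint8_t)((c + d + 1) >> 1);
      return (uint8_t)((ab + cd + 1) >> 1);
    }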
 
 // Blends 8x2 rectangle to 4x1.
-__declspec(naked)
-void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
-                               ptrdiff_t src_stride,
-                               uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
+                                                 ptrdiff_t src_stride,
+                                                 uint8_t* dst_argb,
+                                                 int dst_width) {
   __asm {
     push       esi
-    mov        eax, [esp + 4 + 4]    // src_argb
-    mov        esi, [esp + 4 + 8]    // src_stride
-    mov        edx, [esp + 4 + 12]   // dst_argb
-    mov        ecx, [esp + 4 + 16]   // dst_width
+    mov        eax, [esp + 4 + 4]  // src_argb
+    mov        esi, [esp + 4 + 8]  // src_stride
+    mov        edx, [esp + 4 + 12]  // dst_argb
+    mov        ecx, [esp + 4 + 16]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1050,11 +1062,11 @@
     movdqu     xmm2, [eax + esi]
     movdqu     xmm3, [eax + esi + 16]
     lea        eax,  [eax + 32]
-    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm0, xmm2  // average rows
     pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd  // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1067,18 +1079,19 @@
 }
 
 // Reads 4 pixels at a time.
-__declspec(naked)
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
-                               int src_stepx,
-                               uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
+                                                 ptrdiff_t src_stride,
+                                                 int src_stepx,
+                                                 uint8_t* dst_argb,
+                                                 int dst_width) {
   __asm {
     push       ebx
     push       edi
-    mov        eax, [esp + 8 + 4]    // src_argb
-                                     // src_stride ignored
-    mov        ebx, [esp + 8 + 12]   // src_stepx
-    mov        edx, [esp + 8 + 16]   // dst_argb
-    mov        ecx, [esp + 8 + 20]   // dst_width
+    mov        eax, [esp + 8 + 4]   // src_argb
+    // src_stride ignored
+    mov        ebx, [esp + 8 + 12]  // src_stepx
+    mov        edx, [esp + 8 + 16]  // dst_argb
+    mov        ecx, [esp + 8 + 20]  // dst_width
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]
 
@@ -1103,21 +1116,21 @@
 }
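
ScaleARGBRowDownEven gathers one 4-byte pixel every src_stepx: lea ebx, [ebx * 4] scales the step to bytes, and edi holds 3 * step for the fourth lane. A scalar sketch:

    #include <stdint.h>

    static void ScaleARGBRowDownEven_sketch(const uint32_t* src, int src_stepx,
                                            uint32_t* dst, int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = src[x * src_stepx];
      }
    }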
 
 // Blends four 2x2 to 4x1.
-__declspec(naked)
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
-                                  ptrdiff_t src_stride,
-                                  int src_stepx,
-                                  uint8* dst_argb, int dst_width) {
+__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
+                                                    ptrdiff_t src_stride,
+                                                    int src_stepx,
+                                                    uint8_t* dst_argb,
+                                                    int dst_width) {
   __asm {
     push       ebx
     push       esi
     push       edi
-    mov        eax, [esp + 12 + 4]    // src_argb
-    mov        esi, [esp + 12 + 8]    // src_stride
-    mov        ebx, [esp + 12 + 12]   // src_stepx
-    mov        edx, [esp + 12 + 16]   // dst_argb
-    mov        ecx, [esp + 12 + 20]   // dst_width
-    lea        esi, [eax + esi]       // row1 pointer
+    mov        eax, [esp + 12 + 4]  // src_argb
+    mov        esi, [esp + 12 + 8]  // src_stride
+    mov        ebx, [esp + 12 + 12]  // src_stepx
+    mov        edx, [esp + 12 + 16]  // dst_argb
+    mov        ecx, [esp + 12 + 20]  // dst_width
+    lea        esi, [eax + esi]  // row1 pointer
     lea        ebx, [ebx * 4]
     lea        edi, [ebx + ebx * 2]
 
@@ -1132,11 +1145,11 @@
     movq       xmm3, qword ptr [esi + ebx * 2]
     movhps     xmm3, qword ptr [esi + edi]
     lea        esi,  [esi + ebx * 4]
-    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm0, xmm2  // average rows
     pavgb      xmm1, xmm3
-    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
-    shufps     xmm0, xmm1, 0x88      // even pixels
-    shufps     xmm2, xmm1, 0xdd      // odd pixels
+    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88  // even pixels
+    shufps     xmm2, xmm1, 0xdd  // odd pixels
     pavgb      xmm0, xmm2
     movdqu     [edx], xmm0
     lea        edx, [edx + 16]
@@ -1151,29 +1164,31 @@
 }
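
The column scalers from here on walk the source in 16.16 fixed point: x accumulates dx per output, pextrw pulls the integer part out of bits 16..31, and the fraction stays in the low half for the filtering variants. Unfiltered, that is simply:

    #include <stdint.h>

    // One ARGB pixel is a uint32_t; x and dx are 16.16 fixed point.
    static void ScaleARGBCols_sketch(uint32_t* dst, const uint32_t* src,
                                     int dst_width, int x, int dx) {
      for (int j = 0; j < dst_width; ++j) {
        dst[j] = src[x >> 16];  // integer part indexes the source pixel
        x += dx;                // fraction accumulates in the low 16 bits
      }
    }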
 
 // Column scaling unfiltered. SSE2 version.
-__declspec(naked)
-void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
-                        int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb,
+                                          const uint8_t* src_argb,
+                                          int dst_width,
+                                          int x,
+                                          int dx) {
   __asm {
     push       edi
     push       esi
-    mov        edi, [esp + 8 + 4]    // dst_argb
-    mov        esi, [esp + 8 + 8]    // src_argb
-    mov        ecx, [esp + 8 + 12]   // dst_width
+    mov        edi, [esp + 8 + 4]  // dst_argb
+    mov        esi, [esp + 8 + 8]  // src_argb
+    mov        ecx, [esp + 8 + 12]  // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
 
-    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
-    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
+    pshufd     xmm2, xmm2, 0  // x0 x0 x0 x0
+    pshufd     xmm0, xmm3, 0x11  // dx  0 dx  0
     paddd      xmm2, xmm0
-    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
-    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
-    paddd      xmm2, xmm0            // x3 x2 x1 x0
-    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
-    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
+    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 2
+    pshufd     xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
+    paddd      xmm2, xmm0  // x3 x2 x1 x0
+    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 4
+    pshufd     xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4
 
-    pextrw     eax, xmm2, 1          // get x0 integer.
-    pextrw     edx, xmm2, 3          // get x1 integer.
+    pextrw     eax, xmm2, 1  // get x0 integer.
+    pextrw     edx, xmm2, 3  // get x1 integer.
 
     cmp        ecx, 0
     jle        xloop99
@@ -1180,24 +1195,24 @@
     sub        ecx, 4
     jl         xloop49
 
-    // 4 Pixel loop.
+        // 4 Pixel loop.
  xloop4:
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
-    pextrw     eax, xmm2, 5           // get x2 integer.
-    pextrw     edx, xmm2, 7           // get x3 integer.
-    paddd      xmm2, xmm3             // x += dx
-    punpckldq  xmm0, xmm1             // x0 x1
+    pextrw     eax, xmm2, 5  // get x2 integer.
+    pextrw     edx, xmm2, 7  // get x3 integer.
+    paddd      xmm2, xmm3  // x += dx
+    punpckldq  xmm0, xmm1  // x0 x1
 
     movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
     movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
-    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
-    punpckldq  xmm1, xmm4             // x2 x3
-    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
+    punpckldq  xmm1, xmm4  // x2 x3
+    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
     movdqu     [edi], xmm0
     lea        edi, [edi + 16]
-    sub        ecx, 4                 // 4 pixels
+    sub        ecx, 4  // 4 pixels
     jge        xloop4
 
  xloop49:
@@ -1204,11 +1219,11 @@
     test       ecx, 2
     je         xloop29
 
-    // 2 Pixels.
+        // 2 Pixels.
     movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
     movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
-    pextrw     eax, xmm2, 5           // get x2 integer.
-    punpckldq  xmm0, xmm1             // x0 x1
+    pextrw     eax, xmm2, 5  // get x2 integer.
+    punpckldq  xmm0, xmm1  // x0 x1
 
     movq       qword ptr [edi], xmm0
     lea        edi, [edi + 8]
@@ -1217,7 +1232,7 @@
     test       ecx, 1
     je         xloop99
 
-    // 1 Pixels.
+        // 1 pixel.
     movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
     movd       dword ptr [edi], xmm0
  xloop99:
@@ -1232,60 +1247,62 @@
 // TODO(fbarchard): Port to Neon
 
 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
-static uvec8 kShuffleColARGB = {
-  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
-  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
+static const uvec8 kShuffleColARGB = {
+    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
+    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
 };
 
 // Shuffle table for duplicating 2 fractions into 8 bytes each
-static uvec8 kShuffleFractions = {
-  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
+static const uvec8 kShuffleFractions = {
+    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
 };
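
kShuffleColARGB interleaves the two source pixels channel by channel (b0 b1 g0 g1 r0 r1 a0 a1, per the comment above), so one pmaddubsw blends all four channels of both pixels at once; the xor against 0x007f turns the duplicated fraction bytes into (127 - f, f) weight pairs. Per channel that is (a sketch, f in [0, 127]):

    #include <stdint.h>

    static uint8_t ArgbFilterChannel_sketch(uint8_t c0, uint8_t c1, int f) {
      return (uint8_t)(((127 - f) * c0 + f * c1) >> 7);
    }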
 
-__declspec(naked)
-void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
-                               int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
+                                                 const uint8_t* src_argb,
+                                                 int dst_width,
+                                                 int x,
+                                                 int dx) {
   __asm {
     push       esi
     push       edi
-    mov        edi, [esp + 8 + 4]    // dst_argb
-    mov        esi, [esp + 8 + 8]    // src_argb
-    mov        ecx, [esp + 8 + 12]   // dst_width
+    mov        edi, [esp + 8 + 4]  // dst_argb
+    mov        esi, [esp + 8 + 8]  // src_argb
+    mov        ecx, [esp + 8 + 12]  // dst_width
     movd       xmm2, [esp + 8 + 16]  // x
     movd       xmm3, [esp + 8 + 20]  // dx
     movdqa     xmm4, xmmword ptr kShuffleColARGB
     movdqa     xmm5, xmmword ptr kShuffleFractions
-    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
+    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
     psrlw      xmm6, 9
-    pextrw     eax, xmm2, 1         // get x0 integer. preroll
+    pextrw     eax, xmm2, 1  // get x0 integer. preroll
     sub        ecx, 2
     jl         xloop29
 
-    movdqa     xmm0, xmm2           // x1 = x0 + dx
+    movdqa     xmm0, xmm2  // x1 = x0 + dx
     paddd      xmm0, xmm3
-    punpckldq  xmm2, xmm0           // x0 x1
-    punpckldq  xmm3, xmm3           // dx dx
-    paddd      xmm3, xmm3           // dx * 2, dx * 2
-    pextrw     edx, xmm2, 3         // get x1 integer. preroll
+    punpckldq  xmm2, xmm0  // x0 x1
+    punpckldq  xmm3, xmm3  // dx dx
+    paddd      xmm3, xmm3  // dx * 2, dx * 2
+    pextrw     edx, xmm2, 3  // get x1 integer. preroll
 
     // 2 Pixel loop.
   xloop2:
-    movdqa     xmm1, xmm2           // x0, x1 fractions.
-    paddd      xmm2, xmm3           // x += dx
+    movdqa     xmm1, xmm2  // x0, x1 fractions.
+    paddd      xmm2, xmm3  // x += dx
     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    psrlw      xmm1, 9              // 7 bit fractions.
+    psrlw      xmm1, 9  // 7 bit fractions.
     movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
-    pshufb     xmm1, xmm5           // 0000000011111111
-    pshufb     xmm0, xmm4           // arrange pixels into pairs
-    pxor       xmm1, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
-    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
-    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
-    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
-    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
+    pshufb     xmm1, xmm5  // 0000000011111111
+    pshufb     xmm0, xmm4  // arrange pixels into pairs
+    pxor       xmm1, xmm6  // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
+    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
+    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
+    psrlw      xmm0, 7  // argb 8.7 fixed point to low 8 bits.
+    packuswb   xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
     movq       qword ptr [edi], xmm0
     lea        edi, [edi + 8]
-    sub        ecx, 2               // 2 pixels
+    sub        ecx, 2  // 2 pixels
     jge        xloop2
 
  xloop29:
@@ -1293,15 +1310,15 @@
     add        ecx, 2 - 1
     jl         xloop99
 
-    // 1 pixel remainder
-    psrlw      xmm2, 9              // 7 bit fractions.
+            // 1 pixel remainder
+    psrlw      xmm2, 9  // 7 bit fractions.
     movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
-    pshufb     xmm2, xmm5           // 00000000
-    pshufb     xmm0, xmm4           // arrange pixels into pairs
-    pxor       xmm2, xmm6           // 0..7f and 7f..0
-    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
+    pshufb     xmm2, xmm5  // 00000000
+    pshufb     xmm0, xmm4  // arrange pixels into pairs
+    pxor       xmm2, xmm6  // 0..7f and 7f..0
+    pmaddubsw  xmm0, xmm2  // argb 16 bit, 1 pixel.
     psrlw      xmm0, 7
-    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
+    packuswb   xmm0, xmm0  // argb 8 bits, 1 pixel.
     movd       [edi], xmm0
 
  xloop99:
@@ -1313,13 +1330,15 @@
 }
 
 // Reads 4 pixels, duplicates them and writes 8 pixels.
-__declspec(naked)
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
-                           int dst_width, int x, int dx) {
+__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
+                                             const uint8_t* src_argb,
+                                             int dst_width,
+                                             int x,
+                                             int dx) {
   __asm {
-    mov        edx, [esp + 4]    // dst_argb
-    mov        eax, [esp + 8]    // src_argb
-    mov        ecx, [esp + 12]   // dst_width
+    mov        edx, [esp + 4]  // dst_argb
+    mov        eax, [esp + 8]  // src_argb
+    mov        ecx, [esp + 12]  // dst_width
 
   wloop:
     movdqu     xmm0, [eax]
@@ -1338,12 +1357,11 @@
 }
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv_X86(int num, int div) {
+__declspec(naked) int FixedDiv_X86(int num, int div) {
   __asm {
-    mov        eax, [esp + 4]    // num
-    cdq                          // extend num to 64 bits
-    shld       edx, eax, 16      // 32.16
+    mov        eax, [esp + 4]  // num
+    cdq  // extend num to 64 bits
+    shld       edx, eax, 16  // 32.16
     shl        eax, 16
     idiv       dword ptr [esp + 8]
     ret
@@ -1351,13 +1369,12 @@
 }
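
The cdq/shld/shl sequence builds the 64-bit value num << 16 in edx:eax, and idiv divides it by the second argument, giving a 16.16 fixed-point quotient. A scalar model:

    #include <stdint.h>

    static int FixedDiv_sketch(int num, int div) {
      return (int)((((int64_t)num) << 16) / div);
    }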
 
 // Divide num by div and return as 16.16 fixed point result.
-__declspec(naked)
-int FixedDiv1_X86(int num, int div) {
+__declspec(naked) int FixedDiv1_X86(int num, int div) {
   __asm {
-    mov        eax, [esp + 4]    // num
-    mov        ecx, [esp + 8]    // denom
-    cdq                          // extend num to 64 bits
-    shld       edx, eax, 16      // 32.16
+    mov        eax, [esp + 4]  // num
+    mov        ecx, [esp + 8]  // denom
+    cdq  // extend num to 64 bits
+    shld       edx, eax, 16  // 32.16
     shl        eax, 16
     sub        eax, 0x00010001
     sbb        edx, 0
--- a/third_party/libyuv/source/video_common.cc
+++ b/third_party/libyuv/source/video_common.cc
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include "libyuv/video_common.h"
 
 #ifdef __cplusplus
@@ -16,40 +15,39 @@
 extern "C" {
 #endif
 
-#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
-
 struct FourCCAliasEntry {
-  uint32 alias;
-  uint32 canonical;
+  uint32_t alias;
+  uint32_t canonical;
 };
 
-static const struct FourCCAliasEntry kFourCCAliases[] = {
-  {FOURCC_IYUV, FOURCC_I420},
-  {FOURCC_YU12, FOURCC_I420},
-  {FOURCC_YU16, FOURCC_I422},
-  {FOURCC_YU24, FOURCC_I444},
-  {FOURCC_YUYV, FOURCC_YUY2},
-  {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
-  {FOURCC_HDYC, FOURCC_UYVY},
-  {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
-  {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
-  {FOURCC_DMB1, FOURCC_MJPG},
-  {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
-  {FOURCC_RGB3, FOURCC_RAW },
-  {FOURCC_BGR3, FOURCC_24BG},
-  {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
-  {FOURCC_CM24, FOURCC_RAW },  // kCMPixelFormat_24RGB
-  {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
-  {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
-  {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
+#define NUM_ALIASES 18
+static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = {
+    {FOURCC_IYUV, FOURCC_I420},
+    {FOURCC_YU12, FOURCC_I420},
+    {FOURCC_YU16, FOURCC_I422},
+    {FOURCC_YU24, FOURCC_I444},
+    {FOURCC_YUYV, FOURCC_YUY2},
+    {FOURCC_YUVS, FOURCC_YUY2},  // kCMPixelFormat_422YpCbCr8_yuvs
+    {FOURCC_HDYC, FOURCC_UYVY},
+    {FOURCC_2VUY, FOURCC_UYVY},  // kCMPixelFormat_422YpCbCr8
+    {FOURCC_JPEG, FOURCC_MJPG},  // Note: JPEG has DHT while MJPG does not.
+    {FOURCC_DMB1, FOURCC_MJPG},
+    {FOURCC_BA81, FOURCC_BGGR},  // deprecated.
+    {FOURCC_RGB3, FOURCC_RAW},
+    {FOURCC_BGR3, FOURCC_24BG},
+    {FOURCC_CM32, FOURCC_BGRA},  // kCMPixelFormat_32ARGB
+    {FOURCC_CM24, FOURCC_RAW},   // kCMPixelFormat_24RGB
+    {FOURCC_L555, FOURCC_RGBO},  // kCMPixelFormat_16LE555
+    {FOURCC_L565, FOURCC_RGBP},  // kCMPixelFormat_16LE565
+    {FOURCC_5551, FOURCC_RGBO},  // kCMPixelFormat_16LE5551
 };
 // TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
 //  {FOURCC_BGRA, FOURCC_ARGB},  // kCMPixelFormat_32BGRA
 
 LIBYUV_API
-uint32 CanonicalFourCC(uint32 fourcc) {
+uint32_t CanonicalFourCC(uint32_t fourcc) {
   int i;
-  for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+  for (i = 0; i < NUM_ALIASES; ++i) {
     if (kFourCCAliases[i].alias == fourcc) {
       return kFourCCAliases[i].canonical;
     }
@@ -62,4 +60,3 @@
 }  // extern "C"
 }  // namespace libyuv
 #endif
-
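
For reference, the alias table collapses equivalent FOURCCs to one canonical spelling; a usage fragment, assuming libyuv/video_common.h is included:

    uint32_t canon = CanonicalFourCC(FOURCC_IYUV);  // FOURCC_I420, per the table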