shithub: libvpx

ref: d36659cec7fab96cedc67db4d511ed7135637d0e
parent: 6e87880e7f9dfb53d10c2be55a82765b076aaa76
author: James Zern <jzern@google.com>
date: Fri Dec 11 07:40:53 EST 2015

move vp9_avg to vpx_dsp

Change-Id: I7bc991abea383db1f86c1bb0f2e849837b54d90f
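
For context, the avg helpers being moved compute a rounded mean over a fixed-size
block of 8-bit pixels. A minimal C sketch of the 8x8 case, mirroring the
vpx_avg_8x8_c / vp10_avg_8x8_c reference implementation visible in the hunks
below (the name avg_8x8_sketch is illustrative only, not part of the patch):

  #include <stdint.h>

  /* Rounded average of an 8x8 block; s points at the top-left pixel and
   * p is the row stride in bytes. Adding 32 (half of 64) before the >> 6
   * rounds the sum of 64 pixels to the nearest integer. */
  static unsigned int avg_8x8_sketch(const uint8_t *s, int p) {
    int sum = 0;
    for (int i = 0; i < 8; ++i, s += p)
      for (int j = 0; j < 8; ++j)
        sum += s[j];
    return (sum + 32) >> 6;
  }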

--- /dev/null
+++ b/test/avg_test.cc
@@ -1,0 +1,411 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vpx_mem/vpx_mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+class AverageTestBase : public ::testing::Test {
+ public:
+  AverageTestBase(int width, int height) : width_(width), height_(height) {}
+
+  static void SetUpTestCase() {
+    source_data_ = reinterpret_cast<uint8_t*>(
+        vpx_memalign(kDataAlignment, kDataBlockSize));
+  }
+
+  static void TearDownTestCase() {
+    vpx_free(source_data_);
+    source_data_ = NULL;
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Handle up to four 64x64 blocks with a stride of up to 128.
+  static const int kDataAlignment = 16;
+  static const int kDataBlockSize = 64 * 128;
+
+  virtual void SetUp() {
+    source_stride_ = (width_ + 31) & ~31;
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  // Sum the pixels in the block and return the rounded average.
+  unsigned int ReferenceAverage8x8(const uint8_t* source, int pitch) {
+    unsigned int average = 0;
+    for (int h = 0; h < 8; ++h)
+      for (int w = 0; w < 8; ++w)
+        average += source[h * source_stride_ + w];
+    return ((average + 32) >> 6);
+  }
+
+  unsigned int ReferenceAverage4x4(const uint8_t* source, int pitch) {
+    unsigned int average = 0;
+    for (int h = 0; h < 4; ++h)
+      for (int w = 0; w < 4; ++w)
+        average += source[h * source_stride_ + w];
+    return ((average + 8) >> 4);
+  }
+
+  void FillConstant(uint8_t fill_constant) {
+    for (int i = 0; i < width_ * height_; ++i) {
+        source_data_[i] = fill_constant;
+    }
+  }
+
+  void FillRandom() {
+    for (int i = 0; i < width_ * height_; ++i) {
+        source_data_[i] = rnd_.Rand8();
+    }
+  }
+
+  int width_, height_;
+  static uint8_t* source_data_;
+  int source_stride_;
+
+  ACMRandom rnd_;
+};
+typedef unsigned int (*AverageFunction)(const uint8_t* s, int pitch);
+
+typedef std::tr1::tuple<int, int, int, int, AverageFunction> AvgFunc;
+
+class AverageTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<AvgFunc>{
+ public:
+  AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {}
+
+ protected:
+  void CheckAverages() {
+    unsigned int expected = 0;
+    if (GET_PARAM(3) == 8) {
+      expected = ReferenceAverage8x8(source_data_ + GET_PARAM(2),
+                                     source_stride_);
+    } else if (GET_PARAM(3) == 4) {
+      expected = ReferenceAverage4x4(source_data_ + GET_PARAM(2),
+                                     source_stride_);
+    }
+
+    ASM_REGISTER_STATE_CHECK(GET_PARAM(4)(source_data_ + GET_PARAM(2),
+                                          source_stride_));
+    unsigned int actual = GET_PARAM(4)(source_data_ + GET_PARAM(2),
+                                       source_stride_);
+
+    EXPECT_EQ(expected, actual);
+  }
+};
+
+typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
+                              const int ref_stride, const int height);
+
+typedef std::tr1::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
+
+class IntProRowTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<IntProRowParam> {
+ public:
+  IntProRowTest()
+    : AverageTestBase(16, GET_PARAM(0)),
+      hbuf_asm_(NULL),
+      hbuf_c_(NULL) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  virtual void SetUp() {
+    hbuf_asm_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
+    hbuf_c_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
+  }
+
+  virtual void TearDown() {
+    vpx_free(hbuf_c_);
+    hbuf_c_ = NULL;
+    vpx_free(hbuf_asm_);
+    hbuf_asm_ = NULL;
+  }
+
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
+        << "Output mismatch";
+  }
+
+ private:
+  IntProRowFunc asm_func_;
+  IntProRowFunc c_func_;
+  int16_t *hbuf_asm_;
+  int16_t *hbuf_c_;
+};
+
+typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
+
+typedef std::tr1::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
+
+class IntProColTest
+    : public AverageTestBase,
+      public ::testing::WithParamInterface<IntProColParam> {
+ public:
+  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
+    asm_func_ = GET_PARAM(1);
+    c_func_ = GET_PARAM(2);
+  }
+
+ protected:
+  void RunComparison() {
+    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
+    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
+    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
+  }
+
+ private:
+  IntProColFunc asm_func_;
+  IntProColFunc c_func_;
+  int16_t sum_asm_;
+  int16_t sum_c_;
+};
+
+typedef int (*SatdFunc)(const int16_t *coeffs, int length);
+typedef std::tr1::tuple<int, SatdFunc> SatdTestParam;
+
+class SatdTest
+    : public ::testing::Test,
+      public ::testing::WithParamInterface<SatdTestParam> {
+ protected:
+  virtual void SetUp() {
+    satd_size_ = GET_PARAM(0);
+    satd_func_ = GET_PARAM(1);
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    src_ = reinterpret_cast<int16_t*>(
+        vpx_memalign(16, sizeof(*src_) * satd_size_));
+    ASSERT_TRUE(src_ != NULL);
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+    vpx_free(src_);
+  }
+
+  void FillConstant(const int16_t val) {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
+  }
+
+  void FillRandom() {
+    for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16();
+  }
+
+  void Check(const int expected) {
+    int total;
+    ASM_REGISTER_STATE_CHECK(total = satd_func_(src_, satd_size_));
+    EXPECT_EQ(expected, total);
+  }
+
+  int satd_size_;
+
+ private:
+  int16_t *src_;
+  SatdFunc satd_func_;
+  ACMRandom rnd_;
+};
+
+uint8_t* AverageTestBase::source_data_ = NULL;
+
+TEST_P(AverageTest, MinValue) {
+  FillConstant(0);
+  CheckAverages();
+}
+
+TEST_P(AverageTest, MaxValue) {
+  FillConstant(255);
+  CheckAverages();
+}
+
+TEST_P(AverageTest, Random) {
+  // The reference frame, but not the source frame, may be unaligned for
+  // certain types of searches.
+  for (int i = 0; i < 1000; i++) {
+    FillRandom();
+    CheckAverages();
+  }
+}
+
+TEST_P(IntProRowTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProRowTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MinValue) {
+  FillConstant(0);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, MaxValue) {
+  FillConstant(255);
+  RunComparison();
+}
+
+TEST_P(IntProColTest, Random) {
+  FillRandom();
+  RunComparison();
+}
+
+
+TEST_P(SatdTest, MinValue) {
+  const int kMin = -32640;
+  const int expected = -kMin * satd_size_;
+  FillConstant(kMin);
+  Check(expected);
+}
+
+TEST_P(SatdTest, MaxValue) {
+  const int kMax = 32640;
+  const int expected = kMax * satd_size_;
+  FillConstant(kMax);
+  Check(expected);
+}
+
+TEST_P(SatdTest, Random) {
+  int expected;
+  switch (satd_size_) {
+    case 16: expected = 205298; break;
+    case 64: expected = 1113950; break;
+    case 256: expected = 4268415; break;
+    case 1024: expected = 16954082; break;
+    default:
+      FAIL() << "Invalid satd size (" << satd_size_
+             << ") valid: 16/64/256/1024";
+  }
+  FillRandom();
+  Check(expected);
+}
+
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c),
+        make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    C, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vpx_satd_c),
+        make_tuple(64, &vpx_satd_c),
+        make_tuple(256, &vpx_satd_c),
+        make_tuple(1024, &vpx_satd_c)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_sse2),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_sse2),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_sse2),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_sse2),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_sse2),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_sse2)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, IntProRowTest, ::testing::Values(
+        make_tuple(16, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
+        make_tuple(32, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c),
+        make_tuple(64, &vpx_int_pro_row_sse2, &vpx_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, IntProColTest, ::testing::Values(
+        make_tuple(16, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
+        make_tuple(32, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c),
+        make_tuple(64, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vpx_satd_sse2),
+        make_tuple(64, &vpx_satd_sse2),
+        make_tuple(256, &vpx_satd_sse2),
+        make_tuple(1024, &vpx_satd_sse2)));
+#endif
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+    NEON, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_neon),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_neon),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_neon),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_neon),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, IntProRowTest, ::testing::Values(
+        make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+        make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+        make_tuple(64, &vpx_int_pro_row_neon, &vpx_int_pro_row_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, IntProColTest, ::testing::Values(
+        make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+        make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+        make_tuple(64, &vpx_int_pro_col_neon, &vpx_int_pro_col_c)));
+
+INSTANTIATE_TEST_CASE_P(
+    NEON, SatdTest,
+    ::testing::Values(
+        make_tuple(16, &vpx_satd_neon),
+        make_tuple(64, &vpx_satd_neon),
+        make_tuple(256, &vpx_satd_neon),
+        make_tuple(1024, &vpx_satd_neon)));
+#endif
+
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(
+    MSA, AverageTest,
+    ::testing::Values(
+        make_tuple(16, 16, 0, 8, &vpx_avg_8x8_msa),
+        make_tuple(16, 16, 5, 8, &vpx_avg_8x8_msa),
+        make_tuple(32, 32, 15, 8, &vpx_avg_8x8_msa),
+        make_tuple(16, 16, 0, 4, &vpx_avg_4x4_msa),
+        make_tuple(16, 16, 5, 4, &vpx_avg_4x4_msa),
+        make_tuple(32, 32, 15, 4, &vpx_avg_4x4_msa)));
+#endif
+
+}  // namespace
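
As a quick sanity check of the rounding these tests rely on: with a constant
fill (the MinValue/MaxValue cases), the rounded block average must reproduce
the fill value exactly, since (64 * fill + 32) >> 6 equals fill, and likewise
(16 * fill + 8) >> 4. A small self-contained check (hypothetical, not part of
the test suite):

  #include <assert.h>

  int main(void) {
    /* e.g. 255 * 64 = 16320 and (16320 + 32) >> 6 == 255, matching the
     * MaxValue expectation for the 8x8 functions; same idea for 4x4. */
    for (unsigned int fill = 0; fill <= 255; ++fill) {
      assert(((fill * 64 + 32) >> 6) == fill);
      assert(((fill * 16 + 8) >> 4) == fill);
    }
    return 0;
  }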
--- a/test/test.mk
+++ b/test/test.mk
@@ -143,7 +143,6 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_avg_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
@@ -170,6 +169,11 @@
 endif # VP10
 
 ## Multi-codec / unconditional whitebox tests.
+
+ifeq ($(findstring yes,$(CONFIG_VP9_ENCODER)$(CONFIG_VP10_ENCODER)),yes)
+LIBVPX_TEST_SRCS-yes += avg_test.cc
+endif
+
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
 
 TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
--- a/test/vp9_avg_test.cc
+++ /dev/null
@@ -1,413 +1,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <limits.h>
-#include <stdio.h>
-#include <string.h>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vpx_config.h"
-#if CONFIG_VP9_ENCODER
-#include "./vp9_rtcd.h"
-#endif
-
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "vpx_mem/vpx_mem.h"
-
-using libvpx_test::ACMRandom;
-
-namespace {
-class AverageTestBase : public ::testing::Test {
- public:
-  AverageTestBase(int width, int height) : width_(width), height_(height) {}
-
-  static void SetUpTestCase() {
-    source_data_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, kDataBlockSize));
-  }
-
-  static void TearDownTestCase() {
-    vpx_free(source_data_);
-    source_data_ = NULL;
-  }
-
-  virtual void TearDown() {
-    libvpx_test::ClearSystemState();
-  }
-
- protected:
-  // Handle blocks up to 4 blocks 64x64 with stride up to 128
-  static const int kDataAlignment = 16;
-  static const int kDataBlockSize = 64 * 128;
-
-  virtual void SetUp() {
-    source_stride_ = (width_ + 31) & ~31;
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-  }
-
-  // Sum Pixels
-  unsigned int ReferenceAverage8x8(const uint8_t* source, int pitch ) {
-    unsigned int average = 0;
-    for (int h = 0; h < 8; ++h)
-      for (int w = 0; w < 8; ++w)
-        average += source[h * source_stride_ + w];
-    return ((average + 32) >> 6);
-  }
-
-  unsigned int ReferenceAverage4x4(const uint8_t* source, int pitch ) {
-    unsigned int average = 0;
-    for (int h = 0; h < 4; ++h)
-      for (int w = 0; w < 4; ++w)
-        average += source[h * source_stride_ + w];
-    return ((average + 8) >> 4);
-  }
-
-  void FillConstant(uint8_t fill_constant) {
-    for (int i = 0; i < width_ * height_; ++i) {
-        source_data_[i] = fill_constant;
-    }
-  }
-
-  void FillRandom() {
-    for (int i = 0; i < width_ * height_; ++i) {
-        source_data_[i] = rnd_.Rand8();
-    }
-  }
-
-  int width_, height_;
-  static uint8_t* source_data_;
-  int source_stride_;
-
-  ACMRandom rnd_;
-};
-typedef unsigned int (*AverageFunction)(const uint8_t* s, int pitch);
-
-typedef std::tr1::tuple<int, int, int, int, AverageFunction> AvgFunc;
-
-class AverageTest
-    : public AverageTestBase,
-      public ::testing::WithParamInterface<AvgFunc>{
- public:
-  AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {}
-
- protected:
-  void CheckAverages() {
-    unsigned int expected = 0;
-    if (GET_PARAM(3) == 8) {
-      expected = ReferenceAverage8x8(source_data_+ GET_PARAM(2),
-                                     source_stride_);
-    } else  if (GET_PARAM(3) == 4) {
-      expected = ReferenceAverage4x4(source_data_+ GET_PARAM(2),
-                                     source_stride_);
-    }
-
-    ASM_REGISTER_STATE_CHECK(GET_PARAM(4)(source_data_+ GET_PARAM(2),
-                                          source_stride_));
-    unsigned int actual = GET_PARAM(4)(source_data_+ GET_PARAM(2),
-                                       source_stride_);
-
-    EXPECT_EQ(expected, actual);
-  }
-};
-
-typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref,
-                              const int ref_stride, const int height);
-
-typedef std::tr1::tuple<int, IntProRowFunc, IntProRowFunc> IntProRowParam;
-
-class IntProRowTest
-    : public AverageTestBase,
-      public ::testing::WithParamInterface<IntProRowParam> {
- public:
-  IntProRowTest()
-    : AverageTestBase(16, GET_PARAM(0)),
-      hbuf_asm_(NULL),
-      hbuf_c_(NULL) {
-    asm_func_ = GET_PARAM(1);
-    c_func_ = GET_PARAM(2);
-  }
-
- protected:
-  virtual void SetUp() {
-    hbuf_asm_ = reinterpret_cast<int16_t*>(
-        vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16));
-    hbuf_c_ = reinterpret_cast<int16_t*>(
-        vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
-  }
-
-  virtual void TearDown() {
-    vpx_free(hbuf_c_);
-    hbuf_c_ = NULL;
-    vpx_free(hbuf_asm_);
-    hbuf_asm_ = NULL;
-  }
-
-  void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
-    EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
-        << "Output mismatch";
-  }
-
- private:
-  IntProRowFunc asm_func_;
-  IntProRowFunc c_func_;
-  int16_t *hbuf_asm_;
-  int16_t *hbuf_c_;
-};
-
-typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width);
-
-typedef std::tr1::tuple<int, IntProColFunc, IntProColFunc> IntProColParam;
-
-class IntProColTest
-    : public AverageTestBase,
-      public ::testing::WithParamInterface<IntProColParam> {
- public:
-  IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) {
-    asm_func_ = GET_PARAM(1);
-    c_func_ = GET_PARAM(2);
-  }
-
- protected:
-  void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(sum_c_ = c_func_(source_data_, width_));
-    ASM_REGISTER_STATE_CHECK(sum_asm_ = asm_func_(source_data_, width_));
-    EXPECT_EQ(sum_c_, sum_asm_) << "Output mismatch";
-  }
-
- private:
-  IntProColFunc asm_func_;
-  IntProColFunc c_func_;
-  int16_t sum_asm_;
-  int16_t sum_c_;
-};
-
-typedef int (*SatdFunc)(const int16_t *coeffs, int length);
-typedef std::tr1::tuple<int, SatdFunc> SatdTestParam;
-
-class SatdTest
-    : public ::testing::Test,
-      public ::testing::WithParamInterface<SatdTestParam> {
- protected:
-  virtual void SetUp() {
-    satd_size_ = GET_PARAM(0);
-    satd_func_ = GET_PARAM(1);
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-    src_ = reinterpret_cast<int16_t*>(
-        vpx_memalign(16, sizeof(*src_) * satd_size_));
-    ASSERT_TRUE(src_ != NULL);
-  }
-
-  virtual void TearDown() {
-    libvpx_test::ClearSystemState();
-    vpx_free(src_);
-  }
-
-  void FillConstant(const int16_t val) {
-    for (int i = 0; i < satd_size_; ++i) src_[i] = val;
-  }
-
-  void FillRandom() {
-    for (int i = 0; i < satd_size_; ++i) src_[i] = rnd_.Rand16();
-  }
-
-  void Check(const int expected) {
-    int total;
-    ASM_REGISTER_STATE_CHECK(total = satd_func_(src_, satd_size_));
-    EXPECT_EQ(expected, total);
-  }
-
-  int satd_size_;
-
- private:
-  int16_t *src_;
-  SatdFunc satd_func_;
-  ACMRandom rnd_;
-};
-
-uint8_t* AverageTestBase::source_data_ = NULL;
-
-TEST_P(AverageTest, MinValue) {
-  FillConstant(0);
-  CheckAverages();
-}
-
-TEST_P(AverageTest, MaxValue) {
-  FillConstant(255);
-  CheckAverages();
-}
-
-TEST_P(AverageTest, Random) {
-  // The reference frame, but not the source frame, may be unaligned for
-  // certain types of searches.
-  for (int i = 0; i < 1000; i++) {
-    FillRandom();
-    CheckAverages();
-  }
-}
-
-TEST_P(IntProRowTest, MinValue) {
-  FillConstant(0);
-  RunComparison();
-}
-
-TEST_P(IntProRowTest, MaxValue) {
-  FillConstant(255);
-  RunComparison();
-}
-
-TEST_P(IntProRowTest, Random) {
-  FillRandom();
-  RunComparison();
-}
-
-TEST_P(IntProColTest, MinValue) {
-  FillConstant(0);
-  RunComparison();
-}
-
-TEST_P(IntProColTest, MaxValue) {
-  FillConstant(255);
-  RunComparison();
-}
-
-TEST_P(IntProColTest, Random) {
-  FillRandom();
-  RunComparison();
-}
-
-
-TEST_P(SatdTest, MinValue) {
-  const int kMin = -32640;
-  const int expected = -kMin * satd_size_;
-  FillConstant(kMin);
-  Check(expected);
-}
-
-TEST_P(SatdTest, MaxValue) {
-  const int kMax = 32640;
-  const int expected = kMax * satd_size_;
-  FillConstant(kMax);
-  Check(expected);
-}
-
-TEST_P(SatdTest, Random) {
-  int expected;
-  switch (satd_size_) {
-    case 16: expected = 205298; break;
-    case 64: expected = 1113950; break;
-    case 256: expected = 4268415; break;
-    case 1024: expected = 16954082; break;
-    default:
-      FAIL() << "Invalid satd size (" << satd_size_
-             << ") valid: 16/64/256/1024";
-  }
-  FillRandom();
-  Check(expected);
-}
-
-using std::tr1::make_tuple;
-
-INSTANTIATE_TEST_CASE_P(
-    C, AverageTest,
-    ::testing::Values(
-        make_tuple(16, 16, 1, 8, &vp9_avg_8x8_c),
-        make_tuple(16, 16, 1, 4, &vp9_avg_4x4_c)));
-
-INSTANTIATE_TEST_CASE_P(
-    C, SatdTest,
-    ::testing::Values(
-        make_tuple(16, &vp9_satd_c),
-        make_tuple(64, &vp9_satd_c),
-        make_tuple(256, &vp9_satd_c),
-        make_tuple(1024, &vp9_satd_c)));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(
-    SSE2, AverageTest,
-    ::testing::Values(
-        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_sse2),
-        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_sse2),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_sse2),
-        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_sse2),
-        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_sse2),
-        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_sse2)));
-
-INSTANTIATE_TEST_CASE_P(
-    SSE2, IntProRowTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
-        make_tuple(32, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c),
-        make_tuple(64, &vp9_int_pro_row_sse2, &vp9_int_pro_row_c)));
-
-INSTANTIATE_TEST_CASE_P(
-    SSE2, IntProColTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
-        make_tuple(32, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c),
-        make_tuple(64, &vp9_int_pro_col_sse2, &vp9_int_pro_col_c)));
-
-INSTANTIATE_TEST_CASE_P(
-    SSE2, SatdTest,
-    ::testing::Values(
-        make_tuple(16, &vp9_satd_sse2),
-        make_tuple(64, &vp9_satd_sse2),
-        make_tuple(256, &vp9_satd_sse2),
-        make_tuple(1024, &vp9_satd_sse2)));
-#endif
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(
-    NEON, AverageTest,
-    ::testing::Values(
-        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_neon),
-        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon),
-        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_neon),
-        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_neon),
-        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_neon)));
-
-INSTANTIATE_TEST_CASE_P(
-    NEON, IntProRowTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
-        make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
-        make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c)));
-
-INSTANTIATE_TEST_CASE_P(
-    NEON, IntProColTest, ::testing::Values(
-        make_tuple(16, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
-        make_tuple(32, &vp9_int_pro_col_neon, &vp9_int_pro_col_c),
-        make_tuple(64, &vp9_int_pro_col_neon, &vp9_int_pro_col_c)));
-
-INSTANTIATE_TEST_CASE_P(
-    NEON, SatdTest,
-    ::testing::Values(
-        make_tuple(16, &vp9_satd_neon),
-        make_tuple(64, &vp9_satd_neon),
-        make_tuple(256, &vp9_satd_neon),
-        make_tuple(1024, &vp9_satd_neon)));
-#endif
-
-#if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(
-    MSA, AverageTest,
-    ::testing::Values(
-        make_tuple(16, 16, 0, 8, &vp9_avg_8x8_msa),
-        make_tuple(16, 16, 5, 8, &vp9_avg_8x8_msa),
-        make_tuple(32, 32, 15, 8, &vp9_avg_8x8_msa),
-        make_tuple(16, 16, 0, 4, &vp9_avg_4x4_msa),
-        make_tuple(16, 16, 5, 4, &vp9_avg_4x4_msa),
-        make_tuple(32, 32, 15, 4, &vp9_avg_4x4_msa)));
-#endif
-
-}  // namespace
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -351,42 +351,6 @@
 #
 if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
 
-add_proto qw/unsigned int vp10_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp10_avg_8x8 sse2 neon msa/;
-
-add_proto qw/unsigned int vp10_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp10_avg_4x4 sse2 msa/;
-
-add_proto qw/void vp10_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-specialize qw/vp10_minmax_8x8 sse2/;
-
-add_proto qw/void vp10_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp10_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
-
-add_proto qw/void vp10_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp10_hadamard_16x16 sse2/;
-
-add_proto qw/int16_t vp10_satd/, "const int16_t *coeff, int length";
-specialize qw/vp10_satd sse2/;
-
-add_proto qw/void vp10_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
-specialize qw/vp10_int_pro_row sse2 neon/;
-
-add_proto qw/int16_t vp10_int_pro_col/, "uint8_t const *ref, const int width";
-specialize qw/vp10_int_pro_col sse2 neon/;
-
-add_proto qw/int vp10_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
-specialize qw/vp10_vector_var neon sse2/;
-
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/unsigned int vp10_highbd_avg_8x8/, "const uint8_t *, int p";
-  specialize qw/vp10_highbd_avg_8x8/;
-  add_proto qw/unsigned int vp10_highbd_avg_4x4/, "const uint8_t *, int p";
-  specialize qw/vp10_highbd_avg_4x4/;
-  add_proto qw/void vp10_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/vp10_highbd_minmax_8x8/;
-}
-
 # ENCODEMB INVOKE
 
 #
--- a/vp10/encoder/arm/neon/avg_neon.c
+++ /dev/null
@@ -1,160 +1,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./vp10_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vpx/vpx_integer.h"
-
-static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
-  const uint32x4_t a = vpaddlq_u16(v_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-unsigned int vp10_avg_8x8_neon(const uint8_t *s, int p) {
-  uint8x8_t v_s0 = vld1_u8(s);
-  const uint8x8_t v_s1 = vld1_u8(s + p);
-  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
-
-  v_s0 = vld1_u8(s + 2 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 3 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 4 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 5 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 6 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 7 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
-}
-
-void vp10_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
-                          const int ref_stride, const int height) {
-  int i;
-  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
-  const int shift_factor = ((height >> 5) + 3) * -1;
-  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
-
-  for (i = 0; i < height; i += 8) {
-    const uint8x16_t vec_row1 = vld1q_u8(ref);
-    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
-    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
-    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
-    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
-    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
-    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
-    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
-
-    ref += ref_stride * 8;
-  }
-
-  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
-  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
-
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
-  hbuf += 8;
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
-}
-
-int16_t vp10_int_pro_col_neon(uint8_t const *ref, const int width) {
-  int i;
-  uint16x8_t vec_sum = vdupq_n_u16(0);
-
-  for (i = 0; i < width; i += 16) {
-    const uint8x16_t vec_row = vld1q_u8(ref);
-    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
-    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
-    ref += 16;
-  }
-
-  return horizontal_add_u16x8(vec_sum);
-}
-
-// ref, src = [0, 510] - max diff = 16-bits
-// bwl = {2, 3, 4}, width = {16, 32, 64}
-int vp10_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
-  int width = 4 << bwl;
-  int32x4_t sse = vdupq_n_s32(0);
-  int16x8_t total = vdupq_n_s16(0);
-
-  assert(width >= 8);
-  assert((width % 8) == 0);
-
-  do {
-    const int16x8_t r = vld1q_s16(ref);
-    const int16x8_t s = vld1q_s16(src);
-    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
-    const int16x4_t diff_lo = vget_low_s16(diff);
-    const int16x4_t diff_hi = vget_high_s16(diff);
-    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
-    sse = vmlal_s16(sse, diff_hi, diff_hi);
-    total = vaddq_s16(total, diff);  // dynamic range 16 bits.
-
-    ref += 8;
-    src += 8;
-    width -= 8;
-  } while (width != 0);
-
-  {
-    // Note: 'total''s pairwise addition could be implemented similarly to
-    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
-    // with the summation of 'sse' performed better on a Cortex-A15.
-    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
-    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
-    const int32x2_t t2 = vpadd_s32(t1, t1);
-    const int t = vget_lane_s32(t2, 0);
-    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int s = vget_lane_s32(s1, 0);
-    const int shift_factor = bwl + 2;
-    return s - ((t * t) >> shift_factor);
-  }
-}
--- a/vp10/encoder/avg.c
+++ /dev/null
@@ -1,230 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vp10_rtcd.h"
-#include "vp10/common/common.h"
-#include "vpx_ports/mem.h"
-
-unsigned int vp10_avg_8x8_c(const uint8_t *s, int p) {
-  int i, j;
-  int sum = 0;
-  for (i = 0; i < 8; ++i, s+=p)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
-
-  return (sum + 32) >> 6;
-}
-
-unsigned int vp10_avg_4x4_c(const uint8_t *s, int p) {
-  int i, j;
-  int sum = 0;
-  for (i = 0; i < 4; ++i, s+=p)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
-
-  return (sum + 8) >> 4;
-}
-
-// src_diff: first pass, 9 bit, dynamic range [-255, 255]
-//           second pass, 12 bit, dynamic range [-2040, 2040]
-static void hadamard_col8(const int16_t *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
-  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
-  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
-  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
-  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
-  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
-  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
-  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
-
-  int16_t c0 = b0 + b2;
-  int16_t c1 = b1 + b3;
-  int16_t c2 = b0 - b2;
-  int16_t c3 = b1 - b3;
-  int16_t c4 = b4 + b6;
-  int16_t c5 = b5 + b7;
-  int16_t c6 = b4 - b6;
-  int16_t c7 = b5 - b7;
-
-  coeff[0] = c0 + c4;
-  coeff[7] = c1 + c5;
-  coeff[3] = c2 + c6;
-  coeff[4] = c3 + c7;
-  coeff[2] = c0 - c4;
-  coeff[6] = c1 - c5;
-  coeff[1] = c2 - c6;
-  coeff[5] = c3 - c7;
-}
-
-void vp10_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
-                        int16_t *coeff) {
-  int idx;
-  int16_t buffer[64];
-  int16_t *tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
-                                                   // dynamic range [-255, 255]
-    tmp_buf += 8;
-    ++src_diff;
-  }
-
-  tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
-                                       // dynamic range [-2040, 2040]
-    coeff += 8;  // coeff: 15 bit
-                 // dynamic range [-16320, 16320]
-    ++tmp_buf;
-  }
-}
-
-// In place 16x16 2D Hadamard transform
-void vp10_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    // src_diff: 9 bit, dynamic range [-255, 255]
-    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
-                                + (idx & 0x01) * 8;
-    vp10_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  // coeff: 15 bit, dynamic range [-16320, 16320]
-  for (idx = 0; idx < 64; ++idx) {
-    int16_t a0 = coeff[0];
-    int16_t a1 = coeff[64];
-    int16_t a2 = coeff[128];
-    int16_t a3 = coeff[192];
-
-    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
-    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
-    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
-    int16_t b3 = (a2 - a3) >> 1;
-
-    coeff[0]   = b0 + b2;  // 16 bit, [-32640, 32640]
-    coeff[64]  = b1 + b3;
-    coeff[128] = b0 - b2;
-    coeff[192] = b1 - b3;
-
-    ++coeff;
-  }
-}
-
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
-int16_t vp10_satd_c(const int16_t *coeff, int length) {
-  int i;
-  int satd = 0;
-  for (i = 0; i < length; ++i)
-    satd += abs(coeff[i]);
-
-  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-  return (int16_t)satd;
-}
-
-// Integer projection onto row vectors.
-// height: value range {16, 32, 64}.
-void vp10_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
-                       const int ref_stride, const int height) {
-  int idx;
-  const int norm_factor = height >> 1;
-  for (idx = 0; idx < 16; ++idx) {
-    int i;
-    hbuf[idx] = 0;
-    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
-    for (i = 0; i < height; ++i)
-      hbuf[idx] += ref[i * ref_stride];
-    // hbuf[idx]: 9 bit, dynamic range [0, 510].
-    hbuf[idx] /= norm_factor;
-    ++ref;
-  }
-}
-
-// width: value range {16, 32, 64}.
-int16_t vp10_int_pro_col_c(uint8_t const *ref, const int width) {
-  int idx;
-  int16_t sum = 0;
-  // sum: 14 bit, dynamic range [0, 16320]
-  for (idx = 0; idx < width; ++idx)
-    sum += ref[idx];
-  return sum;
-}
-
-// ref: [0 - 510]
-// src: [0 - 510]
-// bwl: {2, 3, 4}
-int vp10_vector_var_c(int16_t const *ref, int16_t const *src,
-                     const int bwl) {
-  int i;
-  int width = 4 << bwl;
-  int sse = 0, mean = 0, var;
-
-  for (i = 0; i < width; ++i) {
-    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
-    mean += diff;                // mean: dynamic range 16 bits.
-    sse += diff * diff;          // sse:  dynamic range 26 bits.
-  }
-
-  // (mean * mean): dynamic range 31 bits.
-  var = sse - ((mean * mean) >> (bwl + 2));
-  return var;
-}
-
-void vp10_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
-                      int *min, int *max) {
-  int i, j;
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vp10_highbd_avg_8x8_c(const uint8_t *s8, int p) {
-  int i, j;
-  int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 8; ++i, s+=p)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
-
-  return (sum + 32) >> 6;
-}
-
-unsigned int vp10_highbd_avg_4x4_c(const uint8_t *s8, int p) {
-  int i, j;
-  int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 4; ++i, s+=p)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
-
-  return (sum + 8) >> 4;
-}
-
-void vp10_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
-                             int dp, int *min, int *max) {
-  int i, j;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -536,16 +536,16 @@
     if (x8_idx < pixels_wide && y8_idx < pixels_high) {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        vp10_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+        vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                               d + y8_idx * dp + x8_idx, dp,
                               &min, &max);
       } else {
-        vp10_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+        vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                        d + y8_idx * dp + x8_idx, dp,
                        &min, &max);
       }
 #else
-      vp10_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+      vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                      d + y8_idx * dp + x8_idx, dp,
                      &min, &max);
 #endif
@@ -577,18 +577,18 @@
       int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vp10_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
         if (!is_key_frame)
-          d_avg = vp10_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+          d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
       } else {
-        s_avg = vp10_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
         if (!is_key_frame)
-          d_avg = vp10_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+          d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
       }
 #else
-      s_avg = vp10_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+      s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
       if (!is_key_frame)
-        d_avg = vp10_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+        d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 #endif
       sum = s_avg - d_avg;
       sse = sum * sum;
@@ -616,18 +616,18 @@
       int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vp10_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
         if (!is_key_frame)
-          d_avg = vp10_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+          d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
       } else {
-        s_avg = vp10_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
         if (!is_key_frame)
-          d_avg = vp10_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+          d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
       }
 #else
-      s_avg = vp10_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+      s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
       if (!is_key_frame)
-        d_avg = vp10_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+        d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 #endif
       sum = s_avg - d_avg;
       sse = sum * sum;
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -1759,7 +1759,7 @@
   int center, offset = 0;
   int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
   for (d = 0; d <= bw; d += 16) {
-    this_sad = vp10_vector_var(&ref[d], src, bwl);
+    this_sad = vpx_vector_var(&ref[d], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       offset = d;
@@ -1772,7 +1772,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1785,7 +1785,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1798,7 +1798,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1811,7 +1811,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp10_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1880,13 +1880,13 @@
   // Set up prediction 1-D reference set
   ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
   for (idx = 0; idx < search_width; idx += 16) {
-    vp10_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
     ref_buf += 16;
   }
 
   ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
   for (idx = 0; idx < search_height; ++idx) {
-    vbuf[idx] = vp10_int_pro_col(ref_buf, bw) >> norm_factor;
+    vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor;
     ref_buf += ref_stride;
   }
 
@@ -1893,12 +1893,12 @@
   // Set up src 1-D reference set
   for (idx = 0; idx < bw; idx += 16) {
     src_buf = x->plane[0].src.buf + idx;
-    vp10_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+    vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
   }
 
   src_buf = x->plane[0].src.buf;
   for (idx = 0; idx < bh; ++idx) {
-    src_vbuf[idx] = vp10_int_pro_col(src_buf, bw) >> norm_factor;
+    src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor;
     src_buf += src_stride;
   }
 
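
For orientation, the mcomp.c changes above sit inside the encoder's
integral-projection motion search: frame rows and columns are collapsed into
1-D sums with vpx_int_pro_row/vpx_int_pro_col, and candidate offsets are scored
with vpx_vector_var. A hedged C sketch of that scoring step, following the
removed vp10_vector_var_c reference shown earlier in this patch
(vector_var_sketch is an illustrative name, not a library function):

  #include <stdint.h>

  /* Variance-like score between two 1-D projections of width 4 << bwl:
   * sum of squared differences minus the squared mean difference scaled
   * by the width (the >> (bwl + 2) divides by 4 << bwl). */
  static int vector_var_sketch(const int16_t *ref, const int16_t *src, int bwl) {
    const int width = 4 << bwl;
    int sse = 0, mean = 0;
    for (int i = 0; i < width; ++i) {
      const int diff = ref[i] - src[i];
      mean += diff;
      sse += diff * diff;
    }
    return sse - ((mean * mean) >> (bwl + 2));
  }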
--- a/vp10/encoder/mips/msa/avg_msa.c
+++ /dev/null
@@ -1,56 +1,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp10_rtcd.h"
-#include "vpx_dsp/mips/macros_msa.h"
-
-uint32_t vp10_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
-  uint32_t sum_out;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
-  v4u32 sum = { 0 };
-
-  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
-  HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
-  ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
-  ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
-  sum0 += sum4;
-
-  sum = __msa_hadd_u_w(sum0, sum0);
-  sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
-  sum = __msa_hadd_u_w(sum0, sum0);
-  sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
-  sum_out = __msa_copy_u_w((v4i32)sum, 0);
-
-  return sum_out;
-}
-
-uint32_t vp10_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
-  uint32_t sum_out;
-  uint32_t src0, src1, src2, src3;
-  v16u8 vec = { 0 };
-  v8u16 sum0;
-  v4u32 sum1;
-  v2u64 sum2;
-
-  LW4(src, src_stride, src0, src1, src2, src3);
-  INSERT_W4_UB(src0, src1, src2, src3, vec);
-
-  sum0 = __msa_hadd_u_h(vec, vec);
-  sum1 = __msa_hadd_u_w(sum0, sum0);
-  sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
-  sum1 = __msa_hadd_u_w(sum0, sum0);
-  sum2 = __msa_hadd_u_d(sum1, sum1);
-  sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
-  sum_out = __msa_copy_u_w((v4i32)sum1, 0);
-
-  return sum_out;
-}
--- a/vp10/encoder/x86/avg_intrin_sse2.c
+++ /dev/null
@@ -1,424 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>
-
-#include "./vp10_rtcd.h"
-#include "vpx_ports/mem.h"
-
-void vp10_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
-                         int *min, int *max) {
-  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
-  u0  = _mm_setzero_si128();
-  // Row 0
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff0 = _mm_max_epi16(diff, negdiff);
-  // Row 1
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
-  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
-  // Row 2
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 3
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 4
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 5
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 6
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 7
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
-  *max = _mm_extract_epi16(maxabsdiff, 0);
-
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
-  *min = _mm_extract_epi16(minabsdiff, 0);
-}
-
-unsigned int vp10_avg_8x8_sse2(const uint8_t *s, int p) {
-  __m128i s0, s1, u0;
-  unsigned int avg = 0;
-  u0  = _mm_setzero_si128();
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
-  avg = _mm_extract_epi16(s0, 0);
-  return (avg + 32) >> 6;
-}
-
-unsigned int vp10_avg_4x4_sse2(const uint8_t *s, int p) {
-  __m128i s0, s1, u0;
-  unsigned int avg = 0;
-  u0  = _mm_setzero_si128();
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
-  avg = _mm_extract_epi16(s0, 0);
-  return (avg + 8) >> 4;
-}
-
-static void hadamard_col8_sse2(__m128i *in, int iter) {
-  __m128i a0 = in[0];
-  __m128i a1 = in[1];
-  __m128i a2 = in[2];
-  __m128i a3 = in[3];
-  __m128i a4 = in[4];
-  __m128i a5 = in[5];
-  __m128i a6 = in[6];
-  __m128i a7 = in[7];
-
-  __m128i b0 = _mm_add_epi16(a0, a1);
-  __m128i b1 = _mm_sub_epi16(a0, a1);
-  __m128i b2 = _mm_add_epi16(a2, a3);
-  __m128i b3 = _mm_sub_epi16(a2, a3);
-  __m128i b4 = _mm_add_epi16(a4, a5);
-  __m128i b5 = _mm_sub_epi16(a4, a5);
-  __m128i b6 = _mm_add_epi16(a6, a7);
-  __m128i b7 = _mm_sub_epi16(a6, a7);
-
-  a0 = _mm_add_epi16(b0, b2);
-  a1 = _mm_add_epi16(b1, b3);
-  a2 = _mm_sub_epi16(b0, b2);
-  a3 = _mm_sub_epi16(b1, b3);
-  a4 = _mm_add_epi16(b4, b6);
-  a5 = _mm_add_epi16(b5, b7);
-  a6 = _mm_sub_epi16(b4, b6);
-  a7 = _mm_sub_epi16(b5, b7);
-
-  if (iter == 0) {
-    b0 = _mm_add_epi16(a0, a4);
-    b7 = _mm_add_epi16(a1, a5);
-    b3 = _mm_add_epi16(a2, a6);
-    b4 = _mm_add_epi16(a3, a7);
-    b2 = _mm_sub_epi16(a0, a4);
-    b6 = _mm_sub_epi16(a1, a5);
-    b1 = _mm_sub_epi16(a2, a6);
-    b5 = _mm_sub_epi16(a3, a7);
-
-    a0 = _mm_unpacklo_epi16(b0, b1);
-    a1 = _mm_unpacklo_epi16(b2, b3);
-    a2 = _mm_unpackhi_epi16(b0, b1);
-    a3 = _mm_unpackhi_epi16(b2, b3);
-    a4 = _mm_unpacklo_epi16(b4, b5);
-    a5 = _mm_unpacklo_epi16(b6, b7);
-    a6 = _mm_unpackhi_epi16(b4, b5);
-    a7 = _mm_unpackhi_epi16(b6, b7);
-
-    b0 = _mm_unpacklo_epi32(a0, a1);
-    b1 = _mm_unpacklo_epi32(a4, a5);
-    b2 = _mm_unpackhi_epi32(a0, a1);
-    b3 = _mm_unpackhi_epi32(a4, a5);
-    b4 = _mm_unpacklo_epi32(a2, a3);
-    b5 = _mm_unpacklo_epi32(a6, a7);
-    b6 = _mm_unpackhi_epi32(a2, a3);
-    b7 = _mm_unpackhi_epi32(a6, a7);
-
-    in[0] = _mm_unpacklo_epi64(b0, b1);
-    in[1] = _mm_unpackhi_epi64(b0, b1);
-    in[2] = _mm_unpacklo_epi64(b2, b3);
-    in[3] = _mm_unpackhi_epi64(b2, b3);
-    in[4] = _mm_unpacklo_epi64(b4, b5);
-    in[5] = _mm_unpackhi_epi64(b4, b5);
-    in[6] = _mm_unpacklo_epi64(b6, b7);
-    in[7] = _mm_unpackhi_epi64(b6, b7);
-  } else {
-    in[0] = _mm_add_epi16(a0, a4);
-    in[7] = _mm_add_epi16(a1, a5);
-    in[3] = _mm_add_epi16(a2, a6);
-    in[4] = _mm_add_epi16(a3, a7);
-    in[2] = _mm_sub_epi16(a0, a4);
-    in[6] = _mm_sub_epi16(a1, a5);
-    in[1] = _mm_sub_epi16(a2, a6);
-    in[5] = _mm_sub_epi16(a3, a7);
-  }
-}
-
-void vp10_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
-                           int16_t *coeff) {
-  __m128i src[8];
-  src[0] = _mm_load_si128((const __m128i *)src_diff);
-  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-
-  hadamard_col8_sse2(src, 0);
-  hadamard_col8_sse2(src, 1);
-
-  _mm_store_si128((__m128i *)coeff, src[0]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[1]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[2]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[3]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[4]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[5]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[6]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[7]);
-}
-
-void vp10_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
-                             int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
-                                + (idx & 0x01) * 8;
-    vp10_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  for (idx = 0; idx < 64; idx += 8) {
-    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
-    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
-    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
-    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
-
-    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
-    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
-    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
-    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
-
-    b0 = _mm_srai_epi16(b0, 1);
-    b1 = _mm_srai_epi16(b1, 1);
-    b2 = _mm_srai_epi16(b2, 1);
-    b3 = _mm_srai_epi16(b3, 1);
-
-    coeff0 = _mm_add_epi16(b0, b2);
-    coeff1 = _mm_add_epi16(b1, b3);
-    _mm_store_si128((__m128i *)coeff, coeff0);
-    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
-
-    coeff2 = _mm_sub_epi16(b0, b2);
-    coeff3 = _mm_sub_epi16(b1, b3);
-    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
-    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
-
-    coeff += 8;
-  }
-}
-
-int16_t vp10_satd_sse2(const int16_t *coeff, int length) {
-  int i;
-  __m128i sum = _mm_load_si128((const __m128i *)coeff);
-  __m128i sign = _mm_srai_epi16(sum, 15);
-  __m128i val = _mm_xor_si128(sum, sign);
-  sum = _mm_sub_epi16(val, sign);
-  coeff += 8;
-
-  for (i = 8; i < length; i += 8) {
-    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
-    sign = _mm_srai_epi16(src_line, 15);
-    val = _mm_xor_si128(src_line, sign);
-    val = _mm_sub_epi16(val, sign);
-    sum = _mm_add_epi16(sum, val);
-    coeff += 8;
-  }
-
-  val = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi16(sum, val);
-  val = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi16(sum, val);
-  val = _mm_srli_epi32(sum, 16);
-  sum = _mm_add_epi16(sum, val);
-
-  return _mm_extract_epi16(sum, 0);
-}
-
-void vp10_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
-                          const int ref_stride, const int height) {
-  int idx;
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
-  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
-  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
-  __m128i t0, t1;
-  int height_1 = height - 1;
-  ref += ref_stride;
-
-  for (idx = 1; idx < height_1; idx += 2) {
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-  }
-
-  src_line = _mm_loadu_si128((const __m128i *)ref);
-  t0 = _mm_unpacklo_epi8(src_line, zero);
-  t1 = _mm_unpackhi_epi8(src_line, zero);
-  s0 = _mm_adds_epu16(s0, t0);
-  s1 = _mm_adds_epu16(s1, t1);
-
-  if (height == 64) {
-    s0 = _mm_srai_epi16(s0, 5);
-    s1 = _mm_srai_epi16(s1, 5);
-  } else if (height == 32) {
-    s0 = _mm_srai_epi16(s0, 4);
-    s1 = _mm_srai_epi16(s1, 4);
-  } else {
-    s0 = _mm_srai_epi16(s0, 3);
-    s1 = _mm_srai_epi16(s1, 3);
-  }
-
-  _mm_storeu_si128((__m128i *)hbuf, s0);
-  hbuf += 8;
-  _mm_storeu_si128((__m128i *)hbuf, s1);
-}
-
-int16_t vp10_int_pro_col_sse2(uint8_t const *ref, const int width) {
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
-  __m128i s0 = _mm_sad_epu8(src_line, zero);
-  __m128i s1;
-  int i;
-
-  for (i = 16; i < width; i += 16) {
-    ref += 16;
-    src_line = _mm_load_si128((const __m128i *)ref);
-    s1 = _mm_sad_epu8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, s1);
-  }
-
-  s1 = _mm_srli_si128(s0, 8);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  return _mm_extract_epi16(s0, 0);
-}
-
-int vp10_vector_var_sse2(int16_t const *ref, int16_t const *src,
-                        const int bwl) {
-  int idx;
-  int width = 4 << bwl;
-  int16_t mean;
-  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
-  __m128i v1 = _mm_load_si128((const __m128i *)src);
-  __m128i diff = _mm_subs_epi16(v0, v1);
-  __m128i sum = diff;
-  __m128i sse = _mm_madd_epi16(diff, diff);
-
-  ref += 8;
-  src += 8;
-
-  for (idx = 8; idx < width; idx += 8) {
-    v0 = _mm_loadu_si128((const __m128i *)ref);
-    v1 = _mm_load_si128((const __m128i *)src);
-    diff = _mm_subs_epi16(v0, v1);
-
-    sum = _mm_add_epi16(sum, diff);
-    v0  = _mm_madd_epi16(diff, diff);
-    sse = _mm_add_epi32(sse, v0);
-
-    ref += 8;
-    src += 8;
-  }
-
-  v0  = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi16(sum, v0);
-  v0  = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi16(sum, v0);
-  v0  = _mm_srli_epi32(sum, 16);
-  sum = _mm_add_epi16(sum, v0);
-
-  v1  = _mm_srli_si128(sse, 8);
-  sse = _mm_add_epi32(sse, v1);
-  v1  = _mm_srli_epi64(sse, 32);
-  sse = _mm_add_epi32(sse, v1);
-
-  mean = _mm_extract_epi16(sum, 0);
-
-  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
-}
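
Every SSE2 kernel in the deleted file above funnels its per-lane sums through the same horizontal-reduction idiom: shift the upper half down, add, and repeat until the total sits in the lowest lane. A minimal, self-contained sketch of that idiom, using only the SSE2 intrinsics the file already relies on (illustration only, not part of the patch):

#include <emmintrin.h>

/* Reduce eight unsigned 16-bit lanes to one scalar sum, the same way the
 * avg/int_pro kernels above do: fold 8 lanes -> 4 -> 2 -> 1. */
static unsigned int horizontal_add_u16x8_sse2(__m128i v) {
  v = _mm_adds_epu16(v, _mm_srli_si128(v, 8));   /* add the upper 64 bits        */
  v = _mm_adds_epu16(v, _mm_srli_epi64(v, 32));  /* add the upper 32 of each 64  */
  v = _mm_adds_epu16(v, _mm_srli_epi64(v, 16));  /* add the upper 16 of each 32  */
  return _mm_extract_epi16(v, 0);                /* lowest lane now holds the sum */
}

The saturating adds are safe for the 8x8 average because the sum of sixty-four 8-bit pixels tops out at 64 * 255 = 16320, well inside 16 bits.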
--- a/vp10/encoder/x86/dct_ssse3_x86_64.asm
+++ /dev/null
@@ -1,121 +1,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp10
-
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the forward transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
-SECTION .text
-
-%if ARCH_X86_64
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-%macro HMD8_1D 0
-  psubw              m8, m0, m1
-  psubw              m9, m2, m3
-  paddw              m0, m1
-  paddw              m2, m3
-  SWAP               1, 8
-  SWAP               3, 9
-  psubw              m8, m4, m5
-  psubw              m9, m6, m7
-  paddw              m4, m5
-  paddw              m6, m7
-  SWAP               5, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m2
-  psubw              m9, m1, m3
-  paddw              m0, m2
-  paddw              m1, m3
-  SWAP               2, 8
-  SWAP               3, 9
-  psubw              m8, m4, m6
-  psubw              m9, m5, m7
-  paddw              m4, m6
-  paddw              m5, m7
-  SWAP               6, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m4
-  psubw              m9, m1, m5
-  paddw              m0, m4
-  paddw              m1, m5
-  SWAP               4, 8
-  SWAP               5, 9
-  psubw              m8, m2, m6
-  psubw              m9, m3, m7
-  paddw              m2, m6
-  paddw              m3, m7
-  SWAP               6, 8
-  SWAP               7, 9
-%endmacro
-
-INIT_XMM ssse3
-cglobal hadamard_8x8, 3, 5, 10, input, stride, output
-  lea                r3, [2 * strideq]
-  lea                r4, [4 * strideq]
-
-  mova               m0, [inputq]
-  mova               m1, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m2, [inputq]
-  mova               m3, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m4, [inputq]
-  mova               m5, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m6, [inputq]
-  mova               m7, [inputq + r3]
-
-  HMD8_1D
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-  HMD8_1D
-
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
-
-  RET
-%endif
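
For readers unfamiliar with x86inc: as I read its conventions, `cglobal hadamard_8x8, 3, 5, 10, input, stride, output` declares a three-argument function that uses five general-purpose and ten XMM registers, and the `private_prefix` define plus `INIT_XMM ssse3` prefix and suffix the exported symbol name. The deleted assembly above should therefore have corresponded to a C entry point along these lines (a sketch inferred from the macro conventions, not text from the patch):

#include <stdint.h>

/* Presumed symbol produced by the deleted x86inc assembly above. */
void vp10_hadamard_8x8_ssse3(const int16_t *input, int stride, int16_t *output);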
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -17,7 +17,6 @@
 
 VP10_CX_SRCS-yes += vp10_cx_iface.c
 
-VP10_CX_SRCS-yes += encoder/avg.c
 VP10_CX_SRCS-yes += encoder/bitstream.c
 VP10_CX_SRCS-yes += encoder/context_tree.c
 VP10_CX_SRCS-yes += encoder/context_tree.h
@@ -87,7 +86,6 @@
 VP10_CX_SRCS-yes += encoder/mbgraph.c
 VP10_CX_SRCS-yes += encoder/mbgraph.h
 
-VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/avg_intrin_sse2.c
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -102,7 +100,6 @@
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3_x86_64.asm
-VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3_x86_64.asm
 endif
 endif
 
@@ -119,10 +116,8 @@
 VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/dct_neon.c
 VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/error_neon.c
 endif
-VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/avg_neon.c
 VP10_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/quantize_neon.c
 
-VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/avg_msa.c
 VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/error_msa.c
 VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct4x4_msa.c
 VP10_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/fdct8x8_msa.c
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -194,42 +194,6 @@
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 
-add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
-specialize qw/vp9_avg_8x8 sse2 neon msa/;
-
-add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp9_avg_4x4 sse2 neon msa/;
-
-add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-specialize qw/vp9_minmax_8x8 sse2/;
-
-add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
-
-add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
-specialize qw/vp9_hadamard_16x16 sse2/;
-
-add_proto qw/int vp9_satd/, "const int16_t *coeff, int length";
-specialize qw/vp9_satd sse2 neon/;
-
-add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
-specialize qw/vp9_int_pro_row sse2 neon/;
-
-add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
-specialize qw/vp9_int_pro_col sse2 neon/;
-
-add_proto qw/int vp9_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
-specialize qw/vp9_vector_var neon sse2/;
-
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/unsigned int vp9_highbd_avg_8x8/, "const uint8_t *, int p";
-  specialize qw/vp9_highbd_avg_8x8/;
-  add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
-  specialize qw/vp9_highbd_avg_4x4/;
-  add_proto qw/void vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/vp9_highbd_minmax_8x8/;
-}
-
 # ENCODEMB INVOKE
 
 #
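
The prototypes deleted above do not disappear: the call sites later in this patch switch to vpx_-prefixed equivalents, so the same signatures presumably now live in vpx_dsp's RTCD definitions. For reference, the C-level interface the updated callers rely on (names inferred from the call sites in this patch, signatures copied from the deleted add_proto lines; a sketch, not the generated vpx_dsp_rtcd.h):

#include <stdint.h>

unsigned int vpx_avg_8x8(const uint8_t *, int p);
unsigned int vpx_avg_4x4(const uint8_t *, int p);
void vpx_minmax_8x8(const uint8_t *s, int p, const uint8_t *d, int dp,
                    int *min, int *max);
void vpx_hadamard_8x8(int16_t const *src_diff, int src_stride, int16_t *coeff);
void vpx_hadamard_16x16(int16_t const *src_diff, int src_stride, int16_t *coeff);
int vpx_satd(const int16_t *coeff, int length);
void vpx_int_pro_row(int16_t *hbuf, uint8_t const *ref, const int ref_stride,
                     const int height);
int16_t vpx_int_pro_col(uint8_t const *ref, const int width);
int vpx_vector_var(int16_t const *ref, int16_t const *src, const int bwl);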
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ /dev/null
@@ -1,199 +1,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vpx/vpx_integer.h"
-
-static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
-  const uint32x4_t a = vpaddlq_u16(v_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-unsigned int vp9_avg_4x4_neon(const uint8_t *s, int p) {
-  uint16x8_t v_sum;
-  uint32x2_t v_s0 = vdup_n_u32(0);
-  uint32x2_t v_s1 = vdup_n_u32(0);
-  v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
-  v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
-  v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
-  v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
-  v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
-  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
-}
-
-unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
-  uint8x8_t v_s0 = vld1_u8(s);
-  const uint8x8_t v_s1 = vld1_u8(s + p);
-  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
-
-  v_s0 = vld1_u8(s + 2 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 3 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 4 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 5 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 6 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 7 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
-}
-
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
-int vp9_satd_neon(const int16_t *coeff, int length) {
-  const int16x4_t zero = vdup_n_s16(0);
-  int32x4_t accum = vdupq_n_s32(0);
-
-  do {
-    const int16x8_t src0 = vld1q_s16(coeff);
-    const int16x8_t src8 = vld1q_s16(coeff + 8);
-    accum = vabal_s16(accum, vget_low_s16(src0), zero);
-    accum = vabal_s16(accum, vget_high_s16(src0), zero);
-    accum = vabal_s16(accum, vget_low_s16(src8), zero);
-    accum = vabal_s16(accum, vget_high_s16(src8), zero);
-    length -= 16;
-    coeff += 16;
-  } while (length != 0);
-
-  {
-    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int satd = vget_lane_s32(s1, 0);
-    return satd;
-  }
-}
-
-void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
-                          const int ref_stride, const int height) {
-  int i;
-  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
-  const int shift_factor = ((height >> 5) + 3) * -1;
-  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
-
-  for (i = 0; i < height; i += 8) {
-    const uint8x16_t vec_row1 = vld1q_u8(ref);
-    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
-    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
-    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
-    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
-    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
-    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
-    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
-
-    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
-    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
-
-    ref += ref_stride * 8;
-  }
-
-  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
-  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
-
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
-  hbuf += 8;
-  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
-}
-
-int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) {
-  int i;
-  uint16x8_t vec_sum = vdupq_n_u16(0);
-
-  for (i = 0; i < width; i += 16) {
-    const uint8x16_t vec_row = vld1q_u8(ref);
-    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
-    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
-    ref += 16;
-  }
-
-  return horizontal_add_u16x8(vec_sum);
-}
-
-// ref, src = [0, 510] - max diff = 16-bits
-// bwl = {2, 3, 4}, width = {16, 32, 64}
-int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
-  int width = 4 << bwl;
-  int32x4_t sse = vdupq_n_s32(0);
-  int16x8_t total = vdupq_n_s16(0);
-
-  assert(width >= 8);
-  assert((width % 8) == 0);
-
-  do {
-    const int16x8_t r = vld1q_s16(ref);
-    const int16x8_t s = vld1q_s16(src);
-    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
-    const int16x4_t diff_lo = vget_low_s16(diff);
-    const int16x4_t diff_hi = vget_high_s16(diff);
-    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
-    sse = vmlal_s16(sse, diff_hi, diff_hi);
-    total = vaddq_s16(total, diff);  // dynamic range 16 bits.
-
-    ref += 8;
-    src += 8;
-    width -= 8;
-  } while (width != 0);
-
-  {
-    // Note: 'total''s pairwise addition could be implemented similarly to
-    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
-    // with the summation of 'sse' performed better on a Cortex-A15.
-    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
-    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
-    const int32x2_t t2 = vpadd_s32(t1, t1);
-    const int t = vget_lane_s32(t2, 0);
-    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
-    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
-                                  vreinterpret_s32_s64(vget_high_s64(s0)));
-    const int s = vget_lane_s32(s1, 0);
-    const int shift_factor = bwl + 2;
-    return s - ((t * t) >> shift_factor);
-  }
-}
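
The one non-obvious constant in the deleted vp9_int_pro_row_neon() above is `shift_factor = ((height >> 5) + 3) * -1`; `vshlq_u16` with a negative per-lane count shifts right. For the supported heights {16, 32, 64} this shifts by 3, 4 and 5, which matches the scalar reference's divide by `height >> 1` (see vp9_int_pro_row_c, deleted further below). A small self-contained check of that equivalence (an illustrative sketch, not code from the patch):

#include <assert.h>

/* shift_factor maps height {16, 32, 64} onto a divide by height / 2. */
static void check_int_pro_row_shift(void) {
  int height;
  for (height = 16; height <= 64; height <<= 1) {
    const int shift = (height >> 5) + 3;     /* 3, 4 or 5 */
    assert((1 << shift) == (height >> 1));   /* same normalisation as the C code */
  }
}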
--- a/vp9/encoder/mips/msa/vp9_avg_msa.c
+++ /dev/null
@@ -1,56 +1,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vpx_dsp/mips/macros_msa.h"
-
-uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
-  uint32_t sum_out;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
-  v4u32 sum = { 0 };
-
-  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
-  HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
-  ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
-  ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
-  sum0 += sum4;
-
-  sum = __msa_hadd_u_w(sum0, sum0);
-  sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
-  sum = __msa_hadd_u_w(sum0, sum0);
-  sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
-  sum_out = __msa_copy_u_w((v4i32)sum, 0);
-
-  return sum_out;
-}
-
-uint32_t vp9_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
-  uint32_t sum_out;
-  uint32_t src0, src1, src2, src3;
-  v16u8 vec = { 0 };
-  v8u16 sum0;
-  v4u32 sum1;
-  v2u64 sum2;
-
-  LW4(src, src_stride, src0, src1, src2, src3);
-  INSERT_W4_UB(src0, src1, src2, src3, vec);
-
-  sum0 = __msa_hadd_u_h(vec, vec);
-  sum1 = __msa_hadd_u_w(sum0, sum0);
-  sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
-  sum1 = __msa_hadd_u_w(sum0, sum0);
-  sum2 = __msa_hadd_u_d(sum1, sum1);
-  sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
-  sum_out = __msa_copy_u_w((v4i32)sum1, 0);
-
-  return sum_out;
-}
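
Both MSA kernels above finish with `__msa_srari_w`, which, as I understand the MSA ISA, is a rounding arithmetic right shift, so the final step matches the `(sum + 32) >> 6` and `(sum + 8) >> 4` rounding in the scalar reference deleted below. A scalar model of that step (sketch only):

/* Rounded right shift: srari-by-6 for the 8x8 average, srari-by-4 for 4x4,
 * i.e. round_shift(sum, 6) == (sum + 32) >> 6. */
static unsigned int round_shift(unsigned int sum, int bits) {
  return (sum + (1u << (bits - 1))) >> bits;
}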
--- a/vp9/encoder/vp9_avg.c
+++ /dev/null
@@ -1,230 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vp9_rtcd.h"
-#include "vp9/common/vp9_common.h"
-#include "vpx_ports/mem.h"
-
-unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) {
-  int i, j;
-  int sum = 0;
-  for (i = 0; i < 8; ++i, s+=p)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
-
-  return (sum + 32) >> 6;
-}
-
-unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
-  int i, j;
-  int sum = 0;
-  for (i = 0; i < 4; ++i, s+=p)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
-
-  return (sum + 8) >> 4;
-}
-
-// src_diff: first pass, 9 bit, dynamic range [-255, 255]
-//           second pass, 12 bit, dynamic range [-2040, 2040]
-static void hadamard_col8(const int16_t *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
-  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
-  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
-  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
-  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
-  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
-  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
-  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
-
-  int16_t c0 = b0 + b2;
-  int16_t c1 = b1 + b3;
-  int16_t c2 = b0 - b2;
-  int16_t c3 = b1 - b3;
-  int16_t c4 = b4 + b6;
-  int16_t c5 = b5 + b7;
-  int16_t c6 = b4 - b6;
-  int16_t c7 = b5 - b7;
-
-  coeff[0] = c0 + c4;
-  coeff[7] = c1 + c5;
-  coeff[3] = c2 + c6;
-  coeff[4] = c3 + c7;
-  coeff[2] = c0 - c4;
-  coeff[6] = c1 - c5;
-  coeff[1] = c2 - c6;
-  coeff[5] = c3 - c7;
-}
-
-void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
-                        int16_t *coeff) {
-  int idx;
-  int16_t buffer[64];
-  int16_t *tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
-                                                   // dynamic range [-255, 255]
-    tmp_buf += 8;
-    ++src_diff;
-  }
-
-  tmp_buf = &buffer[0];
-  for (idx = 0; idx < 8; ++idx) {
-    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
-                                       // dynamic range [-2040, 2040]
-    coeff += 8;  // coeff: 15 bit
-                 // dynamic range [-16320, 16320]
-    ++tmp_buf;
-  }
-}
-
-// In place 16x16 2D Hadamard transform
-void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
-                          int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    // src_diff: 9 bit, dynamic range [-255, 255]
-    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
-                                + (idx & 0x01) * 8;
-    vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  // coeff: 15 bit, dynamic range [-16320, 16320]
-  for (idx = 0; idx < 64; ++idx) {
-    int16_t a0 = coeff[0];
-    int16_t a1 = coeff[64];
-    int16_t a2 = coeff[128];
-    int16_t a3 = coeff[192];
-
-    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
-    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
-    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
-    int16_t b3 = (a2 - a3) >> 1;
-
-    coeff[0]   = b0 + b2;  // 16 bit, [-32640, 32640]
-    coeff[64]  = b1 + b3;
-    coeff[128] = b0 - b2;
-    coeff[192] = b1 - b3;
-
-    ++coeff;
-  }
-}
-
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
-int vp9_satd_c(const int16_t *coeff, int length) {
-  int i;
-  int satd = 0;
-  for (i = 0; i < length; ++i)
-    satd += abs(coeff[i]);
-
-  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
-  return satd;
-}
-
-// Integer projection onto row vectors.
-// height: value range {16, 32, 64}.
-void vp9_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
-                       const int ref_stride, const int height) {
-  int idx;
-  const int norm_factor = height >> 1;
-  for (idx = 0; idx < 16; ++idx) {
-    int i;
-    hbuf[idx] = 0;
-    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
-    for (i = 0; i < height; ++i)
-      hbuf[idx] += ref[i * ref_stride];
-    // hbuf[idx]: 9 bit, dynamic range [0, 510].
-    hbuf[idx] /= norm_factor;
-    ++ref;
-  }
-}
-
-// width: value range {16, 32, 64}.
-int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) {
-  int idx;
-  int16_t sum = 0;
-  // sum: 14 bit, dynamic range [0, 16320]
-  for (idx = 0; idx < width; ++idx)
-    sum += ref[idx];
-  return sum;
-}
-
-// ref: [0 - 510]
-// src: [0 - 510]
-// bwl: {2, 3, 4}
-int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
-                     const int bwl) {
-  int i;
-  int width = 4 << bwl;
-  int sse = 0, mean = 0, var;
-
-  for (i = 0; i < width; ++i) {
-    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
-    mean += diff;                // mean: dynamic range 16 bits.
-    sse += diff * diff;          // sse:  dynamic range 26 bits.
-  }
-
-  // (mean * mean): dynamic range 31 bits.
-  var = sse - ((mean * mean) >> (bwl + 2));
-  return var;
-}
-
-void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
-                      int *min, int *max) {
-  int i, j;
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
-  int i, j;
-  int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 8; ++i, s+=p)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
-
-  return (sum + 32) >> 6;
-}
-
-unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
-  int i, j;
-  int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 4; ++i, s+=p)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
-
-  return (sum + 8) >> 4;
-}
-
-void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
-                             int dp, int *min, int *max) {
-  int i, j;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
-  *min = 255;
-  *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
-    for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
-      *min = diff < *min ? diff : *min;
-      *max = diff > *max ? diff : *max;
-    }
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-
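
A quick arithmetic check of the dynamic-range comments in the file above: an unnormalised 8-point Hadamard pass can grow a value by at most a factor of 8, so residuals in [-255, 255] reach [-2040, 2040] after the column pass and [-16320, 16320] after the row pass (255 * 8 * 8 = 16320, the quoted 15-bit bound). In the 16x16 transform each cross-block combine is preceded by a '>> 1', so the pairwise sums stay within 16320 + 16320 = 32640, which is why the 16-bit comments hold and the output still fits in int16_t.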
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -556,16 +556,16 @@
     if (x8_idx < pixels_wide && y8_idx < pixels_high) {
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+        vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                               d + y8_idx * dp + x8_idx, dp,
                               &min, &max);
       } else {
-        vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+        vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                        d + y8_idx * dp + x8_idx, dp,
                        &min, &max);
       }
 #else
-      vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+      vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
                      d + y8_idx * dp + x8_idx, dp,
                      &min, &max);
 #endif
@@ -597,18 +597,18 @@
       int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+          d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
       } else {
-        s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+        s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+          d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
       }
 #else
-      s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+      s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
       if (!is_key_frame)
-        d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+        d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
 #endif
       sum = s_avg - d_avg;
       sse = sum * sum;
@@ -636,18 +636,18 @@
       int d_avg = 128;
 #if CONFIG_VP9_HIGHBITDEPTH
       if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+          d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
       } else {
-        s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+        s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
         if (!is_key_frame)
-          d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+          d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
       }
 #else
-      s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+      s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
       if (!is_key_frame)
-        d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+        d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
 #endif
       sum = s_avg - d_avg;
       sse = sum * sum;
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1755,7 +1755,7 @@
   int center, offset = 0;
   int bw = 4 << bwl;  // redundant variable, to be changed in the experiments.
   for (d = 0; d <= bw; d += 16) {
-    this_sad = vp9_vector_var(&ref[d], src, bwl);
+    this_sad = vpx_vector_var(&ref[d], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       offset = d;
@@ -1768,7 +1768,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1781,7 +1781,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1794,7 +1794,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1807,7 +1807,7 @@
     // check limit
     if (this_pos < 0 || this_pos > bw)
       continue;
-    this_sad = vp9_vector_var(&ref[this_pos], src, bwl);
+    this_sad = vpx_vector_var(&ref[this_pos], src, bwl);
     if (this_sad < best_sad) {
       best_sad = this_sad;
       center = this_pos;
@@ -1876,13 +1876,13 @@
   // Set up prediction 1-D reference set
   ref_buf = xd->plane[0].pre[0].buf - (bw >> 1);
   for (idx = 0; idx < search_width; idx += 16) {
-    vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
+    vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh);
     ref_buf += 16;
   }
 
   ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride;
   for (idx = 0; idx < search_height; ++idx) {
-    vbuf[idx] = vp9_int_pro_col(ref_buf, bw) >> norm_factor;
+    vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor;
     ref_buf += ref_stride;
   }
 
@@ -1889,12 +1889,12 @@
   // Set up src 1-D reference set
   for (idx = 0; idx < bw; idx += 16) {
     src_buf = x->plane[0].src.buf + idx;
-    vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
+    vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh);
   }
 
   src_buf = x->plane[0].src.buf;
   for (idx = 0; idx < bh; ++idx) {
-    src_vbuf[idx] = vp9_int_pro_col(src_buf, bw) >> norm_factor;
+    src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor;
     src_buf += src_stride;
   }
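
The mcomp changes above are the projection-based coarse motion search: candidate positions are scored with vpx_vector_var on 1-D row and column profiles built by vpx_int_pro_row/vpx_int_pro_col, instead of a full 2-D SAD. A compact sketch of how such profiles are formed (a hypothetical helper for illustration; the real kernels work 16 columns at a time and normalise as in the scalar reference deleted above):

#include <stdint.h>

/* Collapse a bw x bh block into a horizontal profile (one entry per column)
 * and a vertical profile (one entry per row); the search then matches 1-D
 * profiles against each other rather than comparing full blocks. */
static void project_block(const uint8_t *blk, int stride, int bw, int bh,
                          int16_t *hproj, int16_t *vproj) {
  int r, c;
  for (c = 0; c < bw; ++c) {
    int sum = 0;
    for (r = 0; r < bh; ++r) sum += blk[r * stride + c];
    hproj[c] = (int16_t)(sum / (bh >> 1));  /* normalised like int_pro_row */
  }
  for (r = 0; r < bh; ++r) {
    int sum = 0;
    for (c = 0; c < bw; ++c) sum += blk[r * stride + c];
    vproj[r] = (int16_t)sum;                /* caller shifts, as in mcomp above */
  }
}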
 
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -619,7 +619,7 @@
                                   scan_order->scan, scan_order->iscan);
             break;
           case TX_16X16:
-            vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+            vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
             vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                             pd->dequant, eob,
@@ -626,7 +626,7 @@
                             scan_order->scan, scan_order->iscan);
             break;
           case TX_8X8:
-            vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+            vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
             vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
                             p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                             pd->dequant, eob,
@@ -673,7 +673,7 @@
         if (*eob == 1)
           *rate += (int)abs(qcoeff[0]);
         else if (*eob > 1)
-          *rate += vp9_satd((const int16_t *)qcoeff, step << 4);
+          *rate += vpx_satd((const int16_t *)qcoeff, step << 4);
 
         *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
       }
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ /dev/null
@@ -1,423 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>
-
-#include "./vp9_rtcd.h"
-#include "vpx_ports/mem.h"
-
-void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
-                         int *min, int *max) {
-  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
-  u0  = _mm_setzero_si128();
-  // Row 0
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff0 = _mm_max_epi16(diff, negdiff);
-  // Row 1
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
-  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
-  // Row 2
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 3
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 4
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 5
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 6
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-  // Row 7
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
-  diff = _mm_subs_epi16(s0, d0);
-  negdiff = _mm_subs_epi16(u0, diff);
-  absdiff = _mm_max_epi16(diff, negdiff);
-  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
-  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
-  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
-  *max = _mm_extract_epi16(maxabsdiff, 0);
-
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
-  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
-  *min = _mm_extract_epi16(minabsdiff, 0);
-}
-
-unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
-  __m128i s0, s1, u0;
-  unsigned int avg = 0;
-  u0  = _mm_setzero_si128();
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
-  avg = _mm_extract_epi16(s0, 0);
-  return (avg + 32) >> 6;
-}
-
-unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
-  __m128i s0, s1, u0;
-  unsigned int avg = 0;
-  u0  = _mm_setzero_si128();
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
-  avg = _mm_extract_epi16(s0, 0);
-  return (avg + 8) >> 4;
-}
-
-static void hadamard_col8_sse2(__m128i *in, int iter) {
-  __m128i a0 = in[0];
-  __m128i a1 = in[1];
-  __m128i a2 = in[2];
-  __m128i a3 = in[3];
-  __m128i a4 = in[4];
-  __m128i a5 = in[5];
-  __m128i a6 = in[6];
-  __m128i a7 = in[7];
-
-  __m128i b0 = _mm_add_epi16(a0, a1);
-  __m128i b1 = _mm_sub_epi16(a0, a1);
-  __m128i b2 = _mm_add_epi16(a2, a3);
-  __m128i b3 = _mm_sub_epi16(a2, a3);
-  __m128i b4 = _mm_add_epi16(a4, a5);
-  __m128i b5 = _mm_sub_epi16(a4, a5);
-  __m128i b6 = _mm_add_epi16(a6, a7);
-  __m128i b7 = _mm_sub_epi16(a6, a7);
-
-  a0 = _mm_add_epi16(b0, b2);
-  a1 = _mm_add_epi16(b1, b3);
-  a2 = _mm_sub_epi16(b0, b2);
-  a3 = _mm_sub_epi16(b1, b3);
-  a4 = _mm_add_epi16(b4, b6);
-  a5 = _mm_add_epi16(b5, b7);
-  a6 = _mm_sub_epi16(b4, b6);
-  a7 = _mm_sub_epi16(b5, b7);
-
-  if (iter == 0) {
-    b0 = _mm_add_epi16(a0, a4);
-    b7 = _mm_add_epi16(a1, a5);
-    b3 = _mm_add_epi16(a2, a6);
-    b4 = _mm_add_epi16(a3, a7);
-    b2 = _mm_sub_epi16(a0, a4);
-    b6 = _mm_sub_epi16(a1, a5);
-    b1 = _mm_sub_epi16(a2, a6);
-    b5 = _mm_sub_epi16(a3, a7);
-
-    a0 = _mm_unpacklo_epi16(b0, b1);
-    a1 = _mm_unpacklo_epi16(b2, b3);
-    a2 = _mm_unpackhi_epi16(b0, b1);
-    a3 = _mm_unpackhi_epi16(b2, b3);
-    a4 = _mm_unpacklo_epi16(b4, b5);
-    a5 = _mm_unpacklo_epi16(b6, b7);
-    a6 = _mm_unpackhi_epi16(b4, b5);
-    a7 = _mm_unpackhi_epi16(b6, b7);
-
-    b0 = _mm_unpacklo_epi32(a0, a1);
-    b1 = _mm_unpacklo_epi32(a4, a5);
-    b2 = _mm_unpackhi_epi32(a0, a1);
-    b3 = _mm_unpackhi_epi32(a4, a5);
-    b4 = _mm_unpacklo_epi32(a2, a3);
-    b5 = _mm_unpacklo_epi32(a6, a7);
-    b6 = _mm_unpackhi_epi32(a2, a3);
-    b7 = _mm_unpackhi_epi32(a6, a7);
-
-    in[0] = _mm_unpacklo_epi64(b0, b1);
-    in[1] = _mm_unpackhi_epi64(b0, b1);
-    in[2] = _mm_unpacklo_epi64(b2, b3);
-    in[3] = _mm_unpackhi_epi64(b2, b3);
-    in[4] = _mm_unpacklo_epi64(b4, b5);
-    in[5] = _mm_unpackhi_epi64(b4, b5);
-    in[6] = _mm_unpacklo_epi64(b6, b7);
-    in[7] = _mm_unpackhi_epi64(b6, b7);
-  } else {
-    in[0] = _mm_add_epi16(a0, a4);
-    in[7] = _mm_add_epi16(a1, a5);
-    in[3] = _mm_add_epi16(a2, a6);
-    in[4] = _mm_add_epi16(a3, a7);
-    in[2] = _mm_sub_epi16(a0, a4);
-    in[6] = _mm_sub_epi16(a1, a5);
-    in[1] = _mm_sub_epi16(a2, a6);
-    in[5] = _mm_sub_epi16(a3, a7);
-  }
-}
-
-void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
-                           int16_t *coeff) {
-  __m128i src[8];
-  src[0] = _mm_load_si128((const __m128i *)src_diff);
-  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-
-  hadamard_col8_sse2(src, 0);
-  hadamard_col8_sse2(src, 1);
-
-  _mm_store_si128((__m128i *)coeff, src[0]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[1]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[2]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[3]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[4]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[5]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[6]);
-  coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[7]);
-}
-
-void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
-                             int16_t *coeff) {
-  int idx;
-  for (idx = 0; idx < 4; ++idx) {
-    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
-                                + (idx & 0x01) * 8;
-    vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
-  }
-
-  for (idx = 0; idx < 64; idx += 8) {
-    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
-    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
-    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
-    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
-
-    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
-    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
-    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
-    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
-
-    b0 = _mm_srai_epi16(b0, 1);
-    b1 = _mm_srai_epi16(b1, 1);
-    b2 = _mm_srai_epi16(b2, 1);
-    b3 = _mm_srai_epi16(b3, 1);
-
-    coeff0 = _mm_add_epi16(b0, b2);
-    coeff1 = _mm_add_epi16(b1, b3);
-    _mm_store_si128((__m128i *)coeff, coeff0);
-    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
-
-    coeff2 = _mm_sub_epi16(b0, b2);
-    coeff3 = _mm_sub_epi16(b1, b3);
-    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
-    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
-
-    coeff += 8;
-  }
-}
-
-int vp9_satd_sse2(const int16_t *coeff, int length) {
-  int i;
-  const __m128i zero = _mm_setzero_si128();
-  __m128i accum = zero;
-
-  for (i = 0; i < length; i += 8) {
-    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
-    const __m128i inv = _mm_sub_epi16(zero, src_line);
-    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
-    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
-    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
-    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
-    accum = _mm_add_epi32(accum, sum);
-    coeff += 8;
-  }
-
-  {  // cascading summation of accum
-    __m128i hi = _mm_srli_si128(accum, 8);
-    accum = _mm_add_epi32(accum, hi);
-    hi = _mm_srli_epi64(accum, 32);
-    accum = _mm_add_epi32(accum, hi);
-  }
-
-  return _mm_cvtsi128_si32(accum);
-}
-
-void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
-                          const int ref_stride, const int height) {
-  int idx;
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
-  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
-  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
-  __m128i t0, t1;
-  int height_1 = height - 1;
-  ref += ref_stride;
-
-  for (idx = 1; idx < height_1; idx += 2) {
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-
-    src_line = _mm_loadu_si128((const __m128i *)ref);
-    t0 = _mm_unpacklo_epi8(src_line, zero);
-    t1 = _mm_unpackhi_epi8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, t0);
-    s1 = _mm_adds_epu16(s1, t1);
-    ref += ref_stride;
-  }
-
-  src_line = _mm_loadu_si128((const __m128i *)ref);
-  t0 = _mm_unpacklo_epi8(src_line, zero);
-  t1 = _mm_unpackhi_epi8(src_line, zero);
-  s0 = _mm_adds_epu16(s0, t0);
-  s1 = _mm_adds_epu16(s1, t1);
-
-  if (height == 64) {
-    s0 = _mm_srai_epi16(s0, 5);
-    s1 = _mm_srai_epi16(s1, 5);
-  } else if (height == 32) {
-    s0 = _mm_srai_epi16(s0, 4);
-    s1 = _mm_srai_epi16(s1, 4);
-  } else {
-    s0 = _mm_srai_epi16(s0, 3);
-    s1 = _mm_srai_epi16(s1, 3);
-  }
-
-  _mm_storeu_si128((__m128i *)hbuf, s0);
-  hbuf += 8;
-  _mm_storeu_si128((__m128i *)hbuf, s1);
-}
-
-int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) {
-  __m128i zero = _mm_setzero_si128();
-  __m128i src_line = _mm_load_si128((const __m128i *)ref);
-  __m128i s0 = _mm_sad_epu8(src_line, zero);
-  __m128i s1;
-  int i;
-
-  for (i = 16; i < width; i += 16) {
-    ref += 16;
-    src_line = _mm_load_si128((const __m128i *)ref);
-    s1 = _mm_sad_epu8(src_line, zero);
-    s0 = _mm_adds_epu16(s0, s1);
-  }
-
-  s1 = _mm_srli_si128(s0, 8);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  return _mm_extract_epi16(s0, 0);
-}
-
-int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src,
-                        const int bwl) {
-  int idx;
-  int width = 4 << bwl;
-  int16_t mean;
-  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
-  __m128i v1 = _mm_load_si128((const __m128i *)src);
-  __m128i diff = _mm_subs_epi16(v0, v1);
-  __m128i sum = diff;
-  __m128i sse = _mm_madd_epi16(diff, diff);
-
-  ref += 8;
-  src += 8;
-
-  for (idx = 8; idx < width; idx += 8) {
-    v0 = _mm_loadu_si128((const __m128i *)ref);
-    v1 = _mm_load_si128((const __m128i *)src);
-    diff = _mm_subs_epi16(v0, v1);
-
-    sum = _mm_add_epi16(sum, diff);
-    v0  = _mm_madd_epi16(diff, diff);
-    sse = _mm_add_epi32(sse, v0);
-
-    ref += 8;
-    src += 8;
-  }
-
-  v0  = _mm_srli_si128(sum, 8);
-  sum = _mm_add_epi16(sum, v0);
-  v0  = _mm_srli_epi64(sum, 32);
-  sum = _mm_add_epi16(sum, v0);
-  v0  = _mm_srli_epi32(sum, 16);
-  sum = _mm_add_epi16(sum, v0);
-
-  v1  = _mm_srli_si128(sse, 8);
-  sse = _mm_add_epi32(sse, v1);
-  v1  = _mm_srli_epi64(sse, 32);
-  sse = _mm_add_epi32(sse, v1);
-
-  mean = _mm_extract_epi16(sum, 0);
-
-  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
-}
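
One functional difference between the vp9 file above and the vp10 copy deleted earlier: vp9_satd_sse2() widens the absolute values to 32-bit lanes before accumulating and returns an int, while the vp10 variant sums and returns 16-bit values. The dynamic-range comments explain why the wider accumulator matters; a small arithmetic check (sketch only):

#include <assert.h>

/* Worst case from the comments: |coeff| <= 32640, length up to 1024. */
static void check_satd_headroom(void) {
  const long max_sum = 32640L * 1024L;   /* 33423360, needs 26 bits      */
  assert(max_sum > 32767L);              /* cannot live in int16_t       */
  assert(max_sum <= 0x7FFFFFFFL);        /* fits comfortably in 32 bits  */
}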
--- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ /dev/null
@@ -1,121 +1,0 @@
-;
-;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the forward transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
-SECTION .text
-
-%if ARCH_X86_64
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1          m%4, m%2, m%3
-  punpckl%1          m%2, m%3
-  SWAP               %3,  %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X  wd, %1, %2, %9
-  INTERLEAVE_2X  wd, %3, %4, %9
-  INTERLEAVE_2X  wd, %5, %6, %9
-  INTERLEAVE_2X  wd, %7, %8, %9
-
-  INTERLEAVE_2X  dq, %1, %3, %9
-  INTERLEAVE_2X  dq, %2, %4, %9
-  INTERLEAVE_2X  dq, %5, %7, %9
-  INTERLEAVE_2X  dq, %6, %8, %9
-
-  INTERLEAVE_2X  qdq, %1, %5, %9
-  INTERLEAVE_2X  qdq, %3, %7, %9
-  INTERLEAVE_2X  qdq, %2, %6, %9
-  INTERLEAVE_2X  qdq, %4, %8, %9
-
-  SWAP  %2, %5
-  SWAP  %4, %7
-%endmacro
-
-%macro HMD8_1D 0
-  psubw              m8, m0, m1
-  psubw              m9, m2, m3
-  paddw              m0, m1
-  paddw              m2, m3
-  SWAP               1, 8
-  SWAP               3, 9
-  psubw              m8, m4, m5
-  psubw              m9, m6, m7
-  paddw              m4, m5
-  paddw              m6, m7
-  SWAP               5, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m2
-  psubw              m9, m1, m3
-  paddw              m0, m2
-  paddw              m1, m3
-  SWAP               2, 8
-  SWAP               3, 9
-  psubw              m8, m4, m6
-  psubw              m9, m5, m7
-  paddw              m4, m6
-  paddw              m5, m7
-  SWAP               6, 8
-  SWAP               7, 9
-
-  psubw              m8, m0, m4
-  psubw              m9, m1, m5
-  paddw              m0, m4
-  paddw              m1, m5
-  SWAP               4, 8
-  SWAP               5, 9
-  psubw              m8, m2, m6
-  psubw              m9, m3, m7
-  paddw              m2, m6
-  paddw              m3, m7
-  SWAP               6, 8
-  SWAP               7, 9
-%endmacro
-
-INIT_XMM ssse3
-cglobal hadamard_8x8, 3, 5, 10, input, stride, output
-  lea                r3, [2 * strideq]
-  lea                r4, [4 * strideq]
-
-  mova               m0, [inputq]
-  mova               m1, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m2, [inputq]
-  mova               m3, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m4, [inputq]
-  mova               m5, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m6, [inputq]
-  mova               m7, [inputq + r3]
-
-  HMD8_1D
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-  HMD8_1D
-
-  mova              [outputq +   0], m0
-  mova              [outputq +  16], m1
-  mova              [outputq +  32], m2
-  mova              [outputq +  48], m3
-  mova              [outputq +  64], m4
-  mova              [outputq +  80], m5
-  mova              [outputq +  96], m6
-  mova              [outputq + 112], m7
-
-  RET
-%endif
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -17,7 +17,6 @@
 
 VP9_CX_SRCS-yes += vp9_cx_iface.c
 
-VP9_CX_SRCS-yes += encoder/vp9_avg.c
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.c
 VP9_CX_SRCS-yes += encoder/vp9_context_tree.c
 VP9_CX_SRCS-yes += encoder/vp9_context_tree.h
@@ -93,7 +92,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
@@ -114,7 +112,6 @@
 ifeq ($(ARCH_X86_64),yes)
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3_x86_64.asm
 endif
 endif
 
@@ -131,10 +128,8 @@
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c
 endif
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
 
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c
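
The avg-related build entries removed from vp9cx.mk here (and from vp10cx.mk above) have counterparts on the vpx_dsp side: the patch adds vpx_dsp/arm/avg_neon.c immediately below, and the remaining C, SSE2 and MSA sources presumably move under vpx_dsp/ and are listed in vpx_dsp/vpx_dsp.mk rather than in the encoder makefiles.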
--- /dev/null
+++ b/vpx_dsp/arm/avg_neon.c
@@ -1,0 +1,199 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
+  const uint32x4_t a = vpaddlq_u16(v_16x8);
+  const uint64x2_t b = vpaddlq_u32(a);
+  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+                                vreinterpret_u32_u64(vget_high_u64(b)));
+  return vget_lane_u32(c, 0);
+}
+
+unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) {
+  uint16x8_t v_sum;
+  uint32x2_t v_s0 = vdup_n_u32(0);
+  uint32x2_t v_s1 = vdup_n_u32(0);
+  v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
+  v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
+  v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
+  v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
+  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
+}
+
+unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) {
+  uint8x8_t v_s0 = vld1_u8(s);
+  const uint8x8_t v_s1 = vld1_u8(s + p);
+  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
+
+  v_s0 = vld1_u8(s + 2 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 3 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 4 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 5 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 6 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  v_s0 = vld1_u8(s + 7 * p);
+  v_sum = vaddw_u8(v_sum, v_s0);
+
+  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int vpx_satd_neon(const int16_t *coeff, int length) {
+  const int16x4_t zero = vdup_n_s16(0);
+  int32x4_t accum = vdupq_n_s32(0);
+
+  do {
+    const int16x8_t src0 = vld1q_s16(coeff);
+    const int16x8_t src8 = vld1q_s16(coeff + 8);
+    accum = vabal_s16(accum, vget_low_s16(src0), zero);
+    accum = vabal_s16(accum, vget_high_s16(src0), zero);
+    accum = vabal_s16(accum, vget_low_s16(src8), zero);
+    accum = vabal_s16(accum, vget_high_s16(src8), zero);
+    length -= 16;
+    coeff += 16;
+  } while (length != 0);
+
+  {
+    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int satd = vget_lane_s32(s1, 0);
+    return satd;
+  }
+}
+
+void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+                          const int ref_stride, const int height) {
+  int i;
+  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
+  const int shift_factor = ((height >> 5) + 3) * -1;
+  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
+
+  for (i = 0; i < height; i += 8) {
+    const uint8x16_t vec_row1 = vld1q_u8(ref);
+    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
+    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
+    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
+    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
+    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
+    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
+    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
+
+    ref += ref_stride * 8;
+  }
+
+  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
+  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
+
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
+  hbuf += 8;
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
+}
+
+int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+  int i;
+  uint16x8_t vec_sum = vdupq_n_u16(0);
+
+  for (i = 0; i < width; i += 16) {
+    const uint8x16_t vec_row = vld1q_u8(ref);
+    vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row));
+    vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row));
+    ref += 16;
+  }
+
+  return horizontal_add_u16x8(vec_sum);
+}
+
+// ref, src = [0, 510] - max diff = 16-bits
+// bwl = {2, 3, 4}, width = {16, 32, 64}
+int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
+  int width = 4 << bwl;
+  int32x4_t sse = vdupq_n_s32(0);
+  int16x8_t total = vdupq_n_s16(0);
+
+  assert(width >= 8);
+  assert((width % 8) == 0);
+
+  do {
+    const int16x8_t r = vld1q_s16(ref);
+    const int16x8_t s = vld1q_s16(src);
+    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
+    const int16x4_t diff_lo = vget_low_s16(diff);
+    const int16x4_t diff_hi = vget_high_s16(diff);
+    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
+    sse = vmlal_s16(sse, diff_hi, diff_hi);
+    total = vaddq_s16(total, diff);  // dynamic range 16 bits.
+
+    ref += 8;
+    src += 8;
+    width -= 8;
+  } while (width != 0);
+
+  {
+    // Note: 'total''s pairwise addition could be implemented similarly to
+    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
+    // with the summation of 'sse' performed better on a Cortex-A15.
+    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
+    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
+    const int32x2_t t2 = vpadd_s32(t1, t1);
+    const int t = vget_lane_s32(t2, 0);
+    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
+    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
+                                  vreinterpret_s32_s64(vget_high_s64(s0)));
+    const int s = vget_lane_s32(s1, 0);
+    const int shift_factor = bwl + 2;
+    return s - ((t * t) >> shift_factor);
+  }
+}
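
For reference, the rounding used by the two averaging kernels above (and by their C, SSE2 and MSA counterparts below) is plain round-to-nearest integer division: (sum + 32) >> 6 over the 64 pixels of an 8x8 block and (sum + 8) >> 4 over a 4x4 block. A minimal standalone check of that arithmetic, not part of the patch:

    #include <assert.h>

    int main(void) {
      const unsigned sum8x8 = 64u * 255;    /* all-255 8x8 block */
      const unsigned sum4x4 = 16u * 255;    /* all-255 4x4 block */
      assert(((sum8x8 + 32) >> 6) == 255);  /* same rounding as vpx_avg_8x8 */
      assert(((sum4x4 + 8) >> 4) == 255);   /* same rounding as vpx_avg_4x4 */
      return 0;
    }
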
--- /dev/null
+++ b/vpx_dsp/avg.c
@@ -1,0 +1,231 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 8; ++i, s+=p)
+    for (j = 0; j < 8; sum += s[j], ++j) {}
+
+  return (sum + 32) >> 6;
+}
+
+unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
+  int i, j;
+  int sum = 0;
+  for (i = 0; i < 4; ++i, s+=p)
+    for (j = 0; j < 4; sum += s[j], ++j) {}
+
+  return (sum + 8) >> 4;
+}
+
+// src_diff: first pass, 9 bit, dynamic range [-255, 255]
+//           second pass, 12 bit, dynamic range [-2040, 2040]
+static void hadamard_col8(const int16_t *src_diff, int src_stride,
+                          int16_t *coeff) {
+  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+  int16_t c0 = b0 + b2;
+  int16_t c1 = b1 + b3;
+  int16_t c2 = b0 - b2;
+  int16_t c3 = b1 - b3;
+  int16_t c4 = b4 + b6;
+  int16_t c5 = b5 + b7;
+  int16_t c6 = b4 - b6;
+  int16_t c7 = b5 - b7;
+
+  coeff[0] = c0 + c4;
+  coeff[7] = c1 + c5;
+  coeff[3] = c2 + c6;
+  coeff[4] = c3 + c7;
+  coeff[2] = c0 - c4;
+  coeff[6] = c1 - c5;
+  coeff[1] = c2 - c6;
+  coeff[5] = c3 - c7;
+}
+
+void vpx_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+                        int16_t *coeff) {
+  int idx;
+  int16_t buffer[64];
+  int16_t *tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
+                                                   // dynamic range [-255, 255]
+    tmp_buf += 8;
+    ++src_diff;
+  }
+
+  tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    hadamard_col8(tmp_buf, 8, coeff);  // tmp_buf: 12 bit
+                                       // dynamic range [-2040, 2040]
+    coeff += 8;  // coeff: 15 bit
+                 // dynamic range [-16320, 16320]
+    ++tmp_buf;
+  }
+}
+
+// In place 16x16 2D Hadamard transform
+void vpx_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+                          int16_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    // src_diff: 9 bit, dynamic range [-255, 255]
+    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+                                + (idx & 0x01) * 8;
+    vpx_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  // coeff: 15 bit, dynamic range [-16320, 16320]
+  for (idx = 0; idx < 64; ++idx) {
+    int16_t a0 = coeff[0];
+    int16_t a1 = coeff[64];
+    int16_t a2 = coeff[128];
+    int16_t a3 = coeff[192];
+
+    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
+    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
+    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
+    int16_t b3 = (a2 - a3) >> 1;
+
+    coeff[0]   = b0 + b2;  // 16 bit, [-32640, 32640]
+    coeff[64]  = b1 + b3;
+    coeff[128] = b0 - b2;
+    coeff[192] = b1 - b3;
+
+    ++coeff;
+  }
+}
+
+// coeff: 16 bits, dynamic range [-32640, 32640].
+// length: value range {16, 64, 256, 1024}.
+int vpx_satd_c(const int16_t *coeff, int length) {
+  int i;
+  int satd = 0;
+  for (i = 0; i < length; ++i)
+    satd += abs(coeff[i]);
+
+  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+  return satd;
+}
+
+// Integer projection onto row vectors.
+// height: value range {16, 32, 64}.
+void vpx_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref,
+                       const int ref_stride, const int height) {
+  int idx;
+  const int norm_factor = height >> 1;
+  for (idx = 0; idx < 16; ++idx) {
+    int i;
+    hbuf[idx] = 0;
+    // hbuf[idx]: 14 bit, dynamic range [0, 16320].
+    for (i = 0; i < height; ++i)
+      hbuf[idx] += ref[i * ref_stride];
+    // hbuf[idx]: 9 bit, dynamic range [0, 510].
+    hbuf[idx] /= norm_factor;
+    ++ref;
+  }
+}
+
+// width: value range {16, 32, 64}.
+int16_t vpx_int_pro_col_c(uint8_t const *ref, const int width) {
+  int idx;
+  int16_t sum = 0;
+  // sum: 14 bit, dynamic range [0, 16320]
+  for (idx = 0; idx < width; ++idx)
+    sum += ref[idx];
+  return sum;
+}
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4}
+int vpx_vector_var_c(int16_t const *ref, int16_t const *src,
+                     const int bwl) {
+  int i;
+  int width = 4 << bwl;
+  int sse = 0, mean = 0, var;
+
+  for (i = 0; i < width; ++i) {
+    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
+    mean += diff;                // mean: dynamic range 16 bits.
+    sse += diff * diff;          // sse:  dynamic range 26 bits.
+  }
+
+  // (mean * mean): dynamic range 31 bits.
+  var = sse - ((mean * mean) >> (bwl + 2));
+  return var;
+}
+
+void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+                      int *min, int *max) {
+  int i, j;
+  *min = 255;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j]-d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 8; ++i, s+=p)
+    for (j = 0; j < 8; sum += s[j], ++j) {}
+
+  return (sum + 32) >> 6;
+}
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+  int i, j;
+  int sum = 0;
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  for (i = 0; i < 4; ++i, s+=p)
+    for (j = 0; j < 4; sum += s[j], ++j) {}
+
+  return (sum + 8) >> 4;
+}
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+                             int dp, int *min, int *max) {
+  int i, j;
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
+  *min = 255;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j]-d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+
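
The C reference file above also shows how the kernels are meant to be chained: vpx_hadamard_8x8()/vpx_hadamard_16x16() fill a coefficient block whose absolute sum vpx_satd() then returns, staying within the dynamic ranges noted in the comments. A minimal usage sketch, not part of the patch (estimate_residual_cost is an illustrative name only; the prototypes normally come from the generated vpx_dsp_rtcd.h):

    #include "vpx/vpx_integer.h"

    void vpx_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
                              int16_t *coeff);
    int vpx_satd_c(const int16_t *coeff, int length);

    static int estimate_residual_cost(const int16_t *src_diff, int src_stride) {
      int16_t coeff[16 * 16];  /* the SIMD variants expect 16-byte alignment */
      vpx_hadamard_16x16_c(src_diff, src_stride, coeff);
      /* satd: 26 bits worst case, so a plain int is sufficient. */
      return vpx_satd_c(coeff, 16 * 16);
    }
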
--- /dev/null
+++ b/vpx_dsp/mips/avg_msa.c
@@ -1,0 +1,56 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+uint32_t vpx_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
+  uint32_t sum_out;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
+  v4u32 sum = { 0 };
+
+  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+  HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3);
+  HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7);
+  ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6);
+  ADD2(sum0, sum2, sum4, sum6, sum0, sum4);
+  sum0 += sum4;
+
+  sum = __msa_hadd_u_w(sum0, sum0);
+  sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum);
+  sum = __msa_hadd_u_w(sum0, sum0);
+  sum = (v4u32)__msa_srari_w((v4i32)sum, 6);
+  sum_out = __msa_copy_u_w((v4i32)sum, 0);
+
+  return sum_out;
+}
+
+uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
+  uint32_t sum_out;
+  uint32_t src0, src1, src2, src3;
+  v16u8 vec = { 0 };
+  v8u16 sum0;
+  v4u32 sum1;
+  v2u64 sum2;
+
+  LW4(src, src_stride, src0, src1, src2, src3);
+  INSERT_W4_UB(src0, src1, src2, src3, vec);
+
+  sum0 = __msa_hadd_u_h(vec, vec);
+  sum1 = __msa_hadd_u_w(sum0, sum0);
+  sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1);
+  sum1 = __msa_hadd_u_w(sum0, sum0);
+  sum2 = __msa_hadd_u_d(sum1, sum1);
+  sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4);
+  sum_out = __msa_copy_u_w((v4i32)sum1, 0);
+
+  return sum_out;
+}
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -252,6 +252,18 @@
 DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx_x86_64.asm
 endif
 endif
+
+# avg
+DSP_SRCS-yes           += avg.c
+DSP_SRCS-$(HAVE_SSE2)  += x86/avg_intrin_sse2.c
+DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
+DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
+ifeq ($(ARCH_X86_64),yes)
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
+endif
+endif
+
 endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
 ifeq ($(CONFIG_ENCODERS),yes)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -998,6 +998,35 @@
 #
 # Avg
 #
+if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+  add_proto qw/unsigned int vpx_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/vpx_avg_8x8 sse2 neon msa/;
+
+  add_proto qw/unsigned int vpx_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vpx_avg_4x4 sse2 neon msa/;
+
+  add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/vpx_minmax_8x8 sse2/;
+
+  add_proto qw/void vpx_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+  specialize qw/vpx_hadamard_8x8 sse2/, "$ssse3_x86_64_x86inc";
+
+  add_proto qw/void vpx_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+  specialize qw/vpx_hadamard_16x16 sse2/;
+
+  add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
+  specialize qw/vpx_satd sse2 neon/;
+
+  add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
+  specialize qw/vpx_int_pro_row sse2 neon/;
+
+  add_proto qw/int16_t vpx_int_pro_col/, "uint8_t const *ref, const int width";
+  specialize qw/vpx_int_pro_col sse2 neon/;
+
+  add_proto qw/int vpx_vector_var/, "int16_t const *ref, int16_t const *src, const int bwl";
+  specialize qw/vpx_vector_var neon sse2/;
+}  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+
 add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";
 
@@ -1195,6 +1224,13 @@
   #
   # Avg
   #
+  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
+  specialize qw/vpx_highbd_avg_8x8/;
+  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
+  specialize qw/vpx_highbd_avg_4x4/;
+  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/vpx_highbd_minmax_8x8/;
+
   add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
   specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
 
--- /dev/null
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -1,0 +1,423 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+                         int *min, int *max) {
+  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+  u0  = _mm_setzero_si128();
+  // Row 0
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff0 = _mm_max_epi16(diff, negdiff);
+  // Row 1
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+  // Row 2
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 3
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 4
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 5
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 6
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 7
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+  *max = _mm_extract_epi16(maxabsdiff, 0);
+
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+  *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
+unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0  = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 32) >> 6;
+}
+
+unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
+  __m128i s0, s1, u0;
+  unsigned int avg = 0;
+  u0  = _mm_setzero_si128();
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
+  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
+  avg = _mm_extract_epi16(s0, 0);
+  return (avg + 8) >> 4;
+}
+
+static void hadamard_col8_sse2(__m128i *in, int iter) {
+  __m128i a0 = in[0];
+  __m128i a1 = in[1];
+  __m128i a2 = in[2];
+  __m128i a3 = in[3];
+  __m128i a4 = in[4];
+  __m128i a5 = in[5];
+  __m128i a6 = in[6];
+  __m128i a7 = in[7];
+
+  __m128i b0 = _mm_add_epi16(a0, a1);
+  __m128i b1 = _mm_sub_epi16(a0, a1);
+  __m128i b2 = _mm_add_epi16(a2, a3);
+  __m128i b3 = _mm_sub_epi16(a2, a3);
+  __m128i b4 = _mm_add_epi16(a4, a5);
+  __m128i b5 = _mm_sub_epi16(a4, a5);
+  __m128i b6 = _mm_add_epi16(a6, a7);
+  __m128i b7 = _mm_sub_epi16(a6, a7);
+
+  a0 = _mm_add_epi16(b0, b2);
+  a1 = _mm_add_epi16(b1, b3);
+  a2 = _mm_sub_epi16(b0, b2);
+  a3 = _mm_sub_epi16(b1, b3);
+  a4 = _mm_add_epi16(b4, b6);
+  a5 = _mm_add_epi16(b5, b7);
+  a6 = _mm_sub_epi16(b4, b6);
+  a7 = _mm_sub_epi16(b5, b7);
+
+  if (iter == 0) {
+    b0 = _mm_add_epi16(a0, a4);
+    b7 = _mm_add_epi16(a1, a5);
+    b3 = _mm_add_epi16(a2, a6);
+    b4 = _mm_add_epi16(a3, a7);
+    b2 = _mm_sub_epi16(a0, a4);
+    b6 = _mm_sub_epi16(a1, a5);
+    b1 = _mm_sub_epi16(a2, a6);
+    b5 = _mm_sub_epi16(a3, a7);
+
+    a0 = _mm_unpacklo_epi16(b0, b1);
+    a1 = _mm_unpacklo_epi16(b2, b3);
+    a2 = _mm_unpackhi_epi16(b0, b1);
+    a3 = _mm_unpackhi_epi16(b2, b3);
+    a4 = _mm_unpacklo_epi16(b4, b5);
+    a5 = _mm_unpacklo_epi16(b6, b7);
+    a6 = _mm_unpackhi_epi16(b4, b5);
+    a7 = _mm_unpackhi_epi16(b6, b7);
+
+    b0 = _mm_unpacklo_epi32(a0, a1);
+    b1 = _mm_unpacklo_epi32(a4, a5);
+    b2 = _mm_unpackhi_epi32(a0, a1);
+    b3 = _mm_unpackhi_epi32(a4, a5);
+    b4 = _mm_unpacklo_epi32(a2, a3);
+    b5 = _mm_unpacklo_epi32(a6, a7);
+    b6 = _mm_unpackhi_epi32(a2, a3);
+    b7 = _mm_unpackhi_epi32(a6, a7);
+
+    in[0] = _mm_unpacklo_epi64(b0, b1);
+    in[1] = _mm_unpackhi_epi64(b0, b1);
+    in[2] = _mm_unpacklo_epi64(b2, b3);
+    in[3] = _mm_unpackhi_epi64(b2, b3);
+    in[4] = _mm_unpacklo_epi64(b4, b5);
+    in[5] = _mm_unpackhi_epi64(b4, b5);
+    in[6] = _mm_unpacklo_epi64(b6, b7);
+    in[7] = _mm_unpackhi_epi64(b6, b7);
+  } else {
+    in[0] = _mm_add_epi16(a0, a4);
+    in[7] = _mm_add_epi16(a1, a5);
+    in[3] = _mm_add_epi16(a2, a6);
+    in[4] = _mm_add_epi16(a3, a7);
+    in[2] = _mm_sub_epi16(a0, a4);
+    in[6] = _mm_sub_epi16(a1, a5);
+    in[1] = _mm_sub_epi16(a2, a6);
+    in[5] = _mm_sub_epi16(a3, a7);
+  }
+}
+
+void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+                           int16_t *coeff) {
+  __m128i src[8];
+  src[0] = _mm_load_si128((const __m128i *)src_diff);
+  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+
+  hadamard_col8_sse2(src, 0);
+  hadamard_col8_sse2(src, 1);
+
+  _mm_store_si128((__m128i *)coeff, src[0]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[1]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[2]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[3]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[4]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[5]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[6]);
+  coeff += 8;
+  _mm_store_si128((__m128i *)coeff, src[7]);
+}
+
+void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+                             int16_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+                                + (idx & 0x01) * 8;
+    vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  for (idx = 0; idx < 64; idx += 8) {
+    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
+    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
+    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
+    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+
+    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+    b0 = _mm_srai_epi16(b0, 1);
+    b1 = _mm_srai_epi16(b1, 1);
+    b2 = _mm_srai_epi16(b2, 1);
+    b3 = _mm_srai_epi16(b3, 1);
+
+    coeff0 = _mm_add_epi16(b0, b2);
+    coeff1 = _mm_add_epi16(b1, b3);
+    _mm_store_si128((__m128i *)coeff, coeff0);
+    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
+
+    coeff2 = _mm_sub_epi16(b0, b2);
+    coeff3 = _mm_sub_epi16(b1, b3);
+    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
+    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
+
+    coeff += 8;
+  }
+}
+
+int vpx_satd_sse2(const int16_t *coeff, int length) {
+  int i;
+  const __m128i zero = _mm_setzero_si128();
+  __m128i accum = zero;
+
+  for (i = 0; i < length; i += 8) {
+    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+    const __m128i inv = _mm_sub_epi16(zero, src_line);
+    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
+    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
+    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
+    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
+    accum = _mm_add_epi32(accum, sum);
+    coeff += 8;
+  }
+
+  {  // cascading summation of accum
+    __m128i hi = _mm_srli_si128(accum, 8);
+    accum = _mm_add_epi32(accum, hi);
+    hi = _mm_srli_epi64(accum, 32);
+    accum = _mm_add_epi32(accum, hi);
+  }
+
+  return _mm_cvtsi128_si32(accum);
+}
+
+void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
+                          const int ref_stride, const int height) {
+  int idx;
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
+  __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
+  __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
+  __m128i t0, t1;
+  int height_1 = height - 1;
+  ref += ref_stride;
+
+  for (idx = 1; idx < height_1; idx += 2) {
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+
+    src_line = _mm_loadu_si128((const __m128i *)ref);
+    t0 = _mm_unpacklo_epi8(src_line, zero);
+    t1 = _mm_unpackhi_epi8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, t0);
+    s1 = _mm_adds_epu16(s1, t1);
+    ref += ref_stride;
+  }
+
+  src_line = _mm_loadu_si128((const __m128i *)ref);
+  t0 = _mm_unpacklo_epi8(src_line, zero);
+  t1 = _mm_unpackhi_epi8(src_line, zero);
+  s0 = _mm_adds_epu16(s0, t0);
+  s1 = _mm_adds_epu16(s1, t1);
+
+  if (height == 64) {
+    s0 = _mm_srai_epi16(s0, 5);
+    s1 = _mm_srai_epi16(s1, 5);
+  } else if (height == 32) {
+    s0 = _mm_srai_epi16(s0, 4);
+    s1 = _mm_srai_epi16(s1, 4);
+  } else {
+    s0 = _mm_srai_epi16(s0, 3);
+    s1 = _mm_srai_epi16(s1, 3);
+  }
+
+  _mm_storeu_si128((__m128i *)hbuf, s0);
+  hbuf += 8;
+  _mm_storeu_si128((__m128i *)hbuf, s1);
+}
+
+int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i src_line = _mm_load_si128((const __m128i *)ref);
+  __m128i s0 = _mm_sad_epu8(src_line, zero);
+  __m128i s1;
+  int i;
+
+  for (i = 16; i < width; i += 16) {
+    ref += 16;
+    src_line = _mm_load_si128((const __m128i *)ref);
+    s1 = _mm_sad_epu8(src_line, zero);
+    s0 = _mm_adds_epu16(s0, s1);
+  }
+
+  s1 = _mm_srli_si128(s0, 8);
+  s0 = _mm_adds_epu16(s0, s1);
+
+  return _mm_extract_epi16(s0, 0);
+}
+
+int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src,
+                        const int bwl) {
+  int idx;
+  int width = 4 << bwl;
+  int16_t mean;
+  __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
+  __m128i v1 = _mm_load_si128((const __m128i *)src);
+  __m128i diff = _mm_subs_epi16(v0, v1);
+  __m128i sum = diff;
+  __m128i sse = _mm_madd_epi16(diff, diff);
+
+  ref += 8;
+  src += 8;
+
+  for (idx = 8; idx < width; idx += 8) {
+    v0 = _mm_loadu_si128((const __m128i *)ref);
+    v1 = _mm_load_si128((const __m128i *)src);
+    diff = _mm_subs_epi16(v0, v1);
+
+    sum = _mm_add_epi16(sum, diff);
+    v0  = _mm_madd_epi16(diff, diff);
+    sse = _mm_add_epi32(sse, v0);
+
+    ref += 8;
+    src += 8;
+  }
+
+  v0  = _mm_srli_si128(sum, 8);
+  sum = _mm_add_epi16(sum, v0);
+  v0  = _mm_srli_epi64(sum, 32);
+  sum = _mm_add_epi16(sum, v0);
+  v0  = _mm_srli_epi32(sum, 16);
+  sum = _mm_add_epi16(sum, v0);
+
+  v1  = _mm_srli_si128(sse, 8);
+  sse = _mm_add_epi32(sse, v1);
+  v1  = _mm_srli_epi64(sse, 32);
+  sse = _mm_add_epi32(sse, v1);
+
+  mean = _mm_extract_epi16(sum, 0);
+
+  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
+}
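
A practical note on the SSE2 file above, not part of the patch: coeff and src_diff in the Hadamard/SATD kernels, src in vpx_vector_var_sse2 and ref in vpx_int_pro_col_sse2 are read with aligned loads (_mm_load_si128), so callers are expected to hand in 16-byte-aligned buffers (and, for the Hadamard, a stride that keeps every row 16-byte aligned), for example via DECLARE_ALIGNED from vpx_ports/mem.h. The buffer names below are illustrative only:

    #include "vpx_ports/mem.h"
    #include "vpx/vpx_integer.h"

    DECLARE_ALIGNED(16, int16_t, src_diff[16 * 16]);  /* residual, stride 16 */
    DECLARE_ALIGNED(16, int16_t, coeff[16 * 16]);     /* Hadamard output */
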
--- /dev/null
+++ b/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -1,0 +1,121 @@
+;
+;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%define private_prefix vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+; This file provides an SSSE3 version of the Hadamard transform. Some of
+; the macro definitions were originally derived from the ffmpeg project.
+; The current version applies to x86 64-bit only.
+
+SECTION .text
+
+%if ARCH_X86_64
+; matrix transpose
+%macro INTERLEAVE_2X 4
+  punpckh%1          m%4, m%2, m%3
+  punpckl%1          m%2, m%3
+  SWAP               %3,  %4
+%endmacro
+
+%macro TRANSPOSE8X8 9
+  INTERLEAVE_2X  wd, %1, %2, %9
+  INTERLEAVE_2X  wd, %3, %4, %9
+  INTERLEAVE_2X  wd, %5, %6, %9
+  INTERLEAVE_2X  wd, %7, %8, %9
+
+  INTERLEAVE_2X  dq, %1, %3, %9
+  INTERLEAVE_2X  dq, %2, %4, %9
+  INTERLEAVE_2X  dq, %5, %7, %9
+  INTERLEAVE_2X  dq, %6, %8, %9
+
+  INTERLEAVE_2X  qdq, %1, %5, %9
+  INTERLEAVE_2X  qdq, %3, %7, %9
+  INTERLEAVE_2X  qdq, %2, %6, %9
+  INTERLEAVE_2X  qdq, %4, %8, %9
+
+  SWAP  %2, %5
+  SWAP  %4, %7
+%endmacro
+
+%macro HMD8_1D 0
+  psubw              m8, m0, m1
+  psubw              m9, m2, m3
+  paddw              m0, m1
+  paddw              m2, m3
+  SWAP               1, 8
+  SWAP               3, 9
+  psubw              m8, m4, m5
+  psubw              m9, m6, m7
+  paddw              m4, m5
+  paddw              m6, m7
+  SWAP               5, 8
+  SWAP               7, 9
+
+  psubw              m8, m0, m2
+  psubw              m9, m1, m3
+  paddw              m0, m2
+  paddw              m1, m3
+  SWAP               2, 8
+  SWAP               3, 9
+  psubw              m8, m4, m6
+  psubw              m9, m5, m7
+  paddw              m4, m6
+  paddw              m5, m7
+  SWAP               6, 8
+  SWAP               7, 9
+
+  psubw              m8, m0, m4
+  psubw              m9, m1, m5
+  paddw              m0, m4
+  paddw              m1, m5
+  SWAP               4, 8
+  SWAP               5, 9
+  psubw              m8, m2, m6
+  psubw              m9, m3, m7
+  paddw              m2, m6
+  paddw              m3, m7
+  SWAP               6, 8
+  SWAP               7, 9
+%endmacro
+
+INIT_XMM ssse3
+cglobal hadamard_8x8, 3, 5, 10, input, stride, output
+  lea                r3, [2 * strideq]
+  lea                r4, [4 * strideq]
+
+  mova               m0, [inputq]
+  mova               m1, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m2, [inputq]
+  mova               m3, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m4, [inputq]
+  mova               m5, [inputq + r3]
+  lea                inputq, [inputq + r4]
+  mova               m6, [inputq]
+  mova               m7, [inputq + r3]
+
+  HMD8_1D
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+  HMD8_1D
+
+  mova              [outputq +   0], m0
+  mova              [outputq +  16], m1
+  mova              [outputq +  32], m2
+  mova              [outputq +  48], m3
+  mova              [outputq +  64], m4
+  mova              [outputq +  80], m5
+  mova              [outputq +  96], m6
+  mova              [outputq + 112], m7
+
+  RET
+%endif