shithub: libvpx

--- /dev/null

+++ b/test/convolve_test.cc

@@ -1,0 +1,527 @@

+/*

+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+extern "C" {

+#include "./vpx_config.h"

+#include "./vp9_rtcd.h"

+#include "vpx_mem/vpx_mem.h"

+}

+#include "third_party/googletest/src/include/gtest/gtest.h"

+#include "test/acm_random.h"

+#include "test/register_state_check.h"

+#include "test/util.h"

+namespace {

+typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,

+                              uint8_t *dst, int dst_stride,

+                              const int16_t *filter_x, int filter_x_stride,

+                              const int16_t *filter_y, int filter_y_stride,

+                              int w, int h);

+struct ConvolveFunctions {

+  ConvolveFunctions(convolve_fn_t h8, convolve_fn_t h8_avg,

+                    convolve_fn_t v8, convolve_fn_t v8_avg,

+                    convolve_fn_t hv8, convolve_fn_t hv8_avg)

+      : h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg), v8_avg_(v8_avg),

+        hv8_avg_(hv8_avg) {}

+  convolve_fn_t h8_;

+  convolve_fn_t v8_;

+  convolve_fn_t hv8_;

+  convolve_fn_t h8_avg_;

+  convolve_fn_t v8_avg_;

+  convolve_fn_t hv8_avg_;

+};

+// Reference 8-tap subpixel filter, slightly modified to fit into this test.

+#define VP9_FILTER_WEIGHT 128

+#define VP9_FILTER_SHIFT 7

+// Saturates x to the 8-bit pixel range: values below 0 become 0, values

+// above 255 become 255, everything else is returned unchanged.

+static uint8_t clip_pixel(int x) {

+  if (x < 0) return 0;

+  if (x > 255) return 255;

+  return (uint8_t)x;

+}

+// Reference 2-D 8-tap filter used to check the functions under test.

+// A horizontal pass filters the source into a transposed intermediate

+// buffer; a vertical pass then filters that buffer into dst_ptr. Each pass

+// rounds, shifts by VP9_FILTER_SHIFT and clips the result to 0-255.

+//

+// src_ptr points at the top-left pixel of the block; the filter also reads

+// 3 rows/columns before the block and 4 rows/columns after it.

+// HFilter and VFilter each hold 8 taps. output_width and output_height must

+// not exceed 16, the size the intermediate buffer is dimensioned for.

+static void filter_block2d_8_c(const uint8_t *src_ptr,

+                               const unsigned int src_stride,

+                               const int16_t *HFilter,

+                               const int16_t *VFilter,

+                               uint8_t *dst_ptr,

+                               unsigned int dst_stride,

+                               unsigned int output_width,

+                               unsigned int output_height) {

+  // Between passes, we use an intermediate buffer whose height is extended to

+  // have enough horizontally filtered values as input for the vertical pass.

+  // This buffer is allocated to be big enough for the largest block type we

+  // support.

+  const int kInterp_Extend = 4;

+  const unsigned int intermediate_height =

+    (kInterp_Extend - 1) + output_height + kInterp_Extend;

+  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,

+   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height

+   *                                 + kInterp_Extend

+   *                               = 3 + 16 + 4

+   *                               = 23

+   * and filter_max_width = 16

+   */

+  uint8_t intermediate_buffer[23 * 16];

+  // After visiting output_width entries spaced intermediate_height apart,

+  // this rewinds to the first of them and advances by one. The value is

+  // negative, so it is computed in signed arithmetic instead of relying on

+  // unsigned wrap-around followed by a conversion to int.

+  const int intermediate_next_stride =

+      1 - (int)(intermediate_height * output_width);

+  // Larger blocks would overflow the fixed-size intermediate buffer.

+  assert(output_width <= 16);

+  assert(output_height <= 16);

+  // Horizontal pass (src -> transposed intermediate).

+  {

+    uint8_t *output_ptr = intermediate_buffer;

+    const int src_next_row_stride = src_stride - output_width;

+    unsigned int i, j;

+    // Start 3 rows above and 3 columns left of the block.

+    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);

+    for (i = 0; i < intermediate_height; ++i) {

+      for (j = 0; j < output_width; ++j) {

+        // Apply filter...

+        int temp = ((int)src_ptr[0] * HFilter[0]) +

+                   ((int)src_ptr[1] * HFilter[1]) +

+                   ((int)src_ptr[2] * HFilter[2]) +

+                   ((int)src_ptr[3] * HFilter[3]) +

+                   ((int)src_ptr[4] * HFilter[4]) +

+                   ((int)src_ptr[5] * HFilter[5]) +

+                   ((int)src_ptr[6] * HFilter[6]) +

+                   ((int)src_ptr[7] * HFilter[7]) +

+                   (VP9_FILTER_WEIGHT >> 1);  // Rounding

+        // Normalize back to 0-255...

+        *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);

+        ++src_ptr;

+        output_ptr += intermediate_height;

+      }

+      src_ptr += src_next_row_stride;

+      output_ptr += intermediate_next_stride;

+    }

+  }

+  // Vertical pass (transposed intermediate -> dst).

+  {

+    // Because the buffer is transposed, the 8 consecutive entries read below

+    // are vertically adjacent samples of one column.

+    uint8_t *src_ptr = intermediate_buffer;

+    const int dst_next_row_stride = dst_stride - output_width;

+    unsigned int i, j;

+    for (i = 0; i < output_height; ++i) {

+      for (j = 0; j < output_width; ++j) {

+        // Apply filter...

+        int temp = ((int)src_ptr[0] * VFilter[0]) +

+                   ((int)src_ptr[1] * VFilter[1]) +

+                   ((int)src_ptr[2] * VFilter[2]) +

+                   ((int)src_ptr[3] * VFilter[3]) +

+                   ((int)src_ptr[4] * VFilter[4]) +

+                   ((int)src_ptr[5] * VFilter[5]) +

+                   ((int)src_ptr[6] * VFilter[6]) +

+                   ((int)src_ptr[7] * VFilter[7]) +

+                   (VP9_FILTER_WEIGHT >> 1);  // Rounding

+        // Normalize back to 0-255...

+        *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);

+        src_ptr += intermediate_height;

+      }

+      src_ptr += intermediate_next_stride;

+      dst_ptr += dst_next_row_stride;

+    }

+  }

+}

+// Replaces every pixel of the output block with the rounded average of

+// itself and the pixel at the same position in src.

+static void block2d_average_c(uint8_t *src,

+                              unsigned int src_stride,

+                              uint8_t *output_ptr,

+                              unsigned int output_stride,

+                              unsigned int output_width,

+                              unsigned int output_height) {

+  for (unsigned int row = 0; row < output_height; ++row) {

+    const uint8_t *src_row = src + row * src_stride;

+    uint8_t *out_row = output_ptr + row * output_stride;

+    for (unsigned int col = 0; col < output_width; ++col) {

+      // The + 1 makes the halving round to nearest instead of truncating.

+      const int rounded_sum = out_row[col] + src_row[col] + 1;

+      out_row[col] = (uint8_t)(rounded_sum >> 1);

+    }

+  }

+}

+// Reference for the averaging variants: filters the source block with

+// filter_block2d_8_c into a temporary 16x16 buffer, then averages that

+// result into dst_ptr with block2d_average_c.

+static void filter_average_block2d_8_c(const uint8_t *src_ptr,

+                                       const unsigned int src_stride,

+                                       const int16_t *HFilter,

+                                       const int16_t *VFilter,

+                                       uint8_t *dst_ptr,

+                                       unsigned int dst_stride,

+                                       unsigned int output_width,

+                                       unsigned int output_height) {

+  uint8_t tmp[16*16];  // filtered block, stride 16

+  assert(output_width <= 16);  // tmp holds at most a 16x16 block

+  assert(output_height <= 16);

+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 16,

+                     output_width, output_height);

+  block2d_average_c(tmp, 16, dst_ptr, dst_stride,

+                    output_width, output_height);

+}

+// Fixture parameterized on (block width, block height, functions under

+// test). The block being predicted sits inside a kOuterBlockSize x

+// kOuterBlockSize buffer; output bytes outside the block are guard bytes

+// that the functions under test must leave untouched.

+class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {

+ public:

+  static void SetUpTestCase() {

+    // Force input_ to be unaligned, output to be 16 byte aligned.

+    input_ = reinterpret_cast<uint8_t*>(

+        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize + 1))

+        + 1;

+    output_ = reinterpret_cast<uint8_t*>(

+        vpx_memalign(kDataAlignment, kOuterBlockSize * kOuterBlockSize));

+  }

+  static void TearDownTestCase() {

+    // input_ was offset by one byte at allocation; undo that before freeing.

+    vpx_free(input_ - 1);

+    input_ = NULL;

+    vpx_free(output_);

+    output_ = NULL;

+  }

+  protected:

+    static const int kDataAlignment = 16;

+    static const int kOuterBlockSize = 32;

+    static const int kInputStride = kOuterBlockSize;

+    static const int kOutputStride = kOuterBlockSize;

+    static const int kMaxDimension = 16;

+    int Width() const { return GET_PARAM(0); }

+    int Height() const { return GET_PARAM(1); }

+    // Column of the block's left edge: the centered position rounded up to a

+    // multiple of kDataAlignment.

+    int BorderLeft() const {

+      const int center = (kOuterBlockSize - Width()) / 2;

+      return (center + (kDataAlignment - 1)) & ~(kDataAlignment - 1);

+    }

+    // Row of the block's top edge (vertically centered).

+    int BorderTop() const { return (kOuterBlockSize - Height()) / 2; }

+    // True when linear index i of the outer buffer lies outside the block.

+    bool IsIndexInBorder(int i) {

+      return (i < BorderTop() * kOuterBlockSize ||

+              i >= (BorderTop() + Height()) * kOuterBlockSize ||

+              i % kOuterBlockSize < BorderLeft() ||

+              i % kOuterBlockSize >= (BorderLeft() + Width()));

+    }

+    virtual void SetUp() {

+      UUT_ = GET_PARAM(2);

+      // input_ is a pointer, so sizeof(input_) would clear only a

+      // pointer-sized prefix; clear the whole outer block instead.

+      memset(input_, 0, kOuterBlockSize * kOuterBlockSize);

+      /* Set up guard blocks for an inner block centered in the outer block */

+      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {

+        if (IsIndexInBorder(i))

+          output_[i] = 255;

+        else

+          output_[i] = 0;

+      }

+      ::libvpx_test::ACMRandom prng;

+      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i)

+        input_[i] = prng.Rand8();

+    }

+    // Expects every guard byte still to hold the 255 written by SetUp().

+    void CheckGuardBlocks() {

+      for (int i = 0; i < kOuterBlockSize * kOuterBlockSize; ++i) {

+        if (IsIndexInBorder(i))

+          EXPECT_EQ(255, output_[i]);

+      }

+    }

+    // Top-left pixel of the block inside the input buffer.

+    uint8_t* input() {

+      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();

+    }

+    // Top-left pixel of the block inside the output buffer.

+    uint8_t* output() {

+      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();

+    }

+    const ConvolveFunctions* UUT_;

+    static uint8_t* input_;

+    static uint8_t* output_;

+};

+uint8_t* ConvolveTest::input_ = NULL;

+uint8_t* ConvolveTest::output_ = NULL;

+TEST_P(ConvolveTest, GuardBlocks) {

+  CheckGuardBlocks();

+}

+TEST_P(ConvolveTest, CopyHoriz) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  const int16_t filter8[8] = {0, 0, 0, 128, 0, 0, 0, 0};

+  REGISTER_STATE_CHECK(

+      UUT_->h8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,

+                Width(), Height()));

+  CheckGuardBlocks();

+  for (int y = 0; y < Height(); ++y)

+    for (int x = 0; x < Width(); ++x)

+      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])

+          << "(" << x << "," << y << ")";

+}

+TEST_P(ConvolveTest, CopyVert) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  const int16_t filter8[8] = {0, 0, 0, 128, 0, 0, 0, 0};

+  REGISTER_STATE_CHECK(

+      UUT_->v8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,

+                Width(), Height()));

+  CheckGuardBlocks();

+  for (int y = 0; y < Height(); ++y)

+    for (int x = 0; x < Width(); ++x)

+      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])

+          << "(" << x << "," << y << ")";

+}

+TEST_P(ConvolveTest, Copy2D) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  const int16_t filter8[8] = {0, 0, 0, 128, 0, 0, 0, 0};

+  REGISTER_STATE_CHECK(

+      UUT_->hv8_(in, kInputStride, out, kOutputStride, filter8, 16, filter8, 16,

+                 Width(), Height()));

+  CheckGuardBlocks();

+  for (int y = 0; y < Height(); ++y)

+    for (int x = 0; x < Width(); ++x)

+      ASSERT_EQ(out[y * kOutputStride + x], in[y * kInputStride + x])

+          << "(" << x << "," << y << ")";

+}

+TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  uint8_t ref[kOutputStride * kMaxDimension];

+  const int16_t filters[][8] = {

+    { 0,   0,   0, 128,   0,   0,   0,  0},

+    { 0,   1,  -5, 126,   8,  -3,   1,  0},

+    { -1,   3, -10, 122,  18,  -6,   2,  0},

+    { -1,   4, -13, 118,  27,  -9,   3, -1},

+    { -1,   4, -16, 112,  37, -11,   4, -1},

+    { -1,   5, -18, 105,  48, -14,   4, -1},

+    { -1,   5, -19,  97,  58, -16,   5, -1},

+    { -1,   6, -19,  88,  68, -18,   5, -1},

+    { -1,   6, -19,  78,  78, -19,   6, -1},

+    { -1,   5, -18,  68,  88, -19,   6, -1},

+    { -1,   5, -16,  58,  97, -19,   5, -1},

+    { -1,   4, -14,  48, 105, -18,   5, -1},

+    { -1,   4, -11,  37, 112, -16,   4, -1},

+    { -1,   3,  -9,  27, 118, -13,   4, -1},

+    { 0,   2,  -6,  18, 122, -10,   3, -1},

+    { 0,   1,  -3,   8, 126,  -5,   1,  0}

+  };

+  const int kNumFilters = sizeof(filters) / sizeof(filters[0]);

+  for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {

+    for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {

+      filter_block2d_8_c(in, kInputStride,

+                         filters[filter_x], filters[filter_y],

+                         ref, kOutputStride,

+                         Width(), Height());

+      if (filter_x && filter_y)

+        REGISTER_STATE_CHECK(

+            UUT_->hv8_(in, kInputStride, out, kOutputStride,

+                       filters[filter_x], 16, filters[filter_y], 16,

+                       Width(), Height()));

+      else if (filter_y)

+        REGISTER_STATE_CHECK(

+            UUT_->v8_(in, kInputStride, out, kOutputStride,

+                      filters[filter_x], 16, filters[filter_y], 16,

+                      Width(), Height()));

+      else

+        REGISTER_STATE_CHECK(

+            UUT_->h8_(in, kInputStride, out, kOutputStride,

+                      filters[filter_x], 16, filters[filter_y], 16,

+                      Width(), Height()));

+      CheckGuardBlocks();

+      for (int y = 0; y < Height(); ++y)

+        for (int x = 0; x < Width(); ++x)

+          ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x])

+              << "mismatch at (" << x << "," << y << "), "

+              << "filters (" << filter_x << "," << filter_y << ")";

+    }

+  }

+}

+TEST_P(ConvolveTest, MatchesReferenceAveragingSubpixelFilter) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  uint8_t ref[kOutputStride * kMaxDimension];

+  // Populate ref and out with some random data

+  ::libvpx_test::ACMRandom prng;

+  for (int y = 0; y < Height(); ++y) {

+    for (int x = 0; x < Width(); ++x) {

+      const uint8_t r = prng.Rand8();

+      out[y * kOutputStride + x] = r;

+      ref[y * kOutputStride + x] = r;

+    }

+  }

+  const int16_t filters[][8] = {

+    { 0,   0,   0, 128,   0,   0,   0,  0},

+    { 0,   1,  -5, 126,   8,  -3,   1,  0},

+    { -1,   3, -10, 122,  18,  -6,   2,  0},

+    { -1,   4, -13, 118,  27,  -9,   3, -1},

+    { -1,   4, -16, 112,  37, -11,   4, -1},

+    { -1,   5, -18, 105,  48, -14,   4, -1},

+    { -1,   5, -19,  97,  58, -16,   5, -1},

+    { -1,   6, -19,  88,  68, -18,   5, -1},

+    { -1,   6, -19,  78,  78, -19,   6, -1},

+    { -1,   5, -18,  68,  88, -19,   6, -1},

+    { -1,   5, -16,  58,  97, -19,   5, -1},

+    { -1,   4, -14,  48, 105, -18,   5, -1},

+    { -1,   4, -11,  37, 112, -16,   4, -1},

+    { -1,   3,  -9,  27, 118, -13,   4, -1},

+    { 0,   2,  -6,  18, 122, -10,   3, -1},

+    { 0,   1,  -3,   8, 126,  -5,   1,  0}

+  };

+  const int kNumFilters = sizeof(filters) / sizeof(filters[0]);

+  for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {

+    for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {

+      filter_average_block2d_8_c(in, kInputStride,

+                                 filters[filter_x], filters[filter_y],

+                                 ref, kOutputStride,

+                                 Width(), Height());

+      if (filter_x && filter_y)

+        REGISTER_STATE_CHECK(

+            UUT_->hv8_avg_(in, kInputStride, out, kOutputStride,

+                           filters[filter_x], 16, filters[filter_y], 16,

+                           Width(), Height()));

+      else if (filter_y)

+        REGISTER_STATE_CHECK(

+            UUT_->v8_avg_(in, kInputStride, out, kOutputStride,

+                          filters[filter_x], 16, filters[filter_y], 16,

+                          Width(), Height()));

+      else

+        REGISTER_STATE_CHECK(

+            UUT_->h8_avg_(in, kInputStride, out, kOutputStride,

+                          filters[filter_x], 16, filters[filter_y], 16,

+                          Width(), Height()));

+      CheckGuardBlocks();

+      for (int y = 0; y < Height(); ++y)

+        for (int x = 0; x < Width(); ++x)

+          ASSERT_EQ(ref[y * kOutputStride + x], out[y * kOutputStride + x])

+              << "mismatch at (" << x << "," << y << "), "

+              << "filters (" << filter_x << "," << filter_y << ")";

+    }

+  }

+}

+TEST_P(ConvolveTest, ChangeFilterWorks) {

+  uint8_t* const in = input();

+  uint8_t* const out = output();

+  const int16_t filters[][8] = {

+    { 0,   0,   0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0, 128},

+    { 0,   0,   0, 128},

+    { 0,   0, 128},

+    { 0, 128},

+    { 128},

+    { 0,   0,   0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0, 128},

+    { 0,   0,   0, 128},

+    { 0,   0, 128},

+    { 0, 128},

+    { 128},

+    { 0,   0,   0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0,   0, 128},

+    { 0,   0,   0,   0, 128},

+    { 0,   0,   0, 128},

+    { 0,   0, 128},

+    { 0, 128},

+    { 128},

+  };

+  REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,

+                                 filters[0], 17, filters[4], 16,

+                                 Width(), Height()));

+  for (int x = 0; x < (Width() > 4 ? 8 : 4); ++x) {

+    ASSERT_EQ(in[4], out[x]) << "x == " << x;

+  }

+  REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,

+                                 filters[4], 16, filters[0], 17,

+                                 Width(), Height()));

+  for (int y = 0; y < (Height() > 4 ? 8 : 4); ++y) {

+    ASSERT_EQ(in[4 * kInputStride], out[y * kOutputStride]) << "y == " << y;

+  }

+  REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,

+                                  filters[0], 17, filters[0], 17,

+                                  Width(), Height()));

+  for (int y = 0; y < (Height() > 4 ? 8 : 4); ++y) {

+    for (int x = 0; x < (Width() > 4 ? 8 : 4); ++x) {

+      ASSERT_EQ(in[4 * kInputStride + 4], out[y * kOutputStride + x])

+          << "x == " << x << ", y == " << y;

+    }

+  }

+}

+using std::tr1::make_tuple;

+const ConvolveFunctions convolve8_2d_only_c(

+    vp9_convolve8_c, vp9_convolve8_avg_c,

+    vp9_convolve8_c, vp9_convolve8_avg_c,

+    vp9_convolve8_c, vp9_convolve8_avg_c);

+const ConvolveFunctions convolve8_c(

+    vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,

+    vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,

+    vp9_convolve8_c, vp9_convolve8_avg_c);

+INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(

+    make_tuple(4, 4, &convolve8_2d_only_c),

+    make_tuple(8, 4, &convolve8_2d_only_c),

+    make_tuple(8, 8, &convolve8_2d_only_c),

+    make_tuple(16, 16, &convolve8_2d_only_c),

+    make_tuple(4, 4, &convolve8_c),

+    make_tuple(8, 4, &convolve8_c),

+    make_tuple(8, 8, &convolve8_c),

+    make_tuple(16, 16, &convolve8_c)));

+}

+#if HAVE_SSSE3

+const ConvolveFunctions convolve8_ssse3(

+    vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_c,

+    vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_c,

+    vp9_convolve8_ssse3, vp9_convolve8_avg_c);

+INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(

+    make_tuple(4, 4, &convolve8_ssse3),

+    make_tuple(8, 4, &convolve8_ssse3),

+    make_tuple(8, 8, &convolve8_ssse3),

+    make_tuple(16, 16, &convolve8_ssse3)));

+#endif

--- a/test/test.mk

+++ b/test/test.mk

@@ -70,6 +70,7 @@

 LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc

 endif

+LIBVPX_TEST_SRCS-$(CONFIG_VP9)         += convolve_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc

 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc

--- a/vp9/common/generic/vp9_systemdependent.c

+++ b/vp9/common/generic/vp9_systemdependent.c

@@ -11,8 +11,6 @@

 #include "./vpx_config.h"

 #include "vp9_rtcd.h"

-#include "vp9/common/vp9_subpixel.h"

-#include "vp9/common/vp9_loopfilter.h"

 #include "vp9/common/vp9_onyxc_int.h"

 void vp9_machine_specific_config(VP9_COMMON *ctx) {

--- a/vp9/common/ppc/vp9_systemdependent.c

+++ b/vp9/common/ppc/vp9_systemdependent.c

@@ -8,7 +8,6 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

-#include "vp9/common/vp9_subpixel.h"

 #include "vp9/common/vp9_loopfilter.h"

 #include "recon.h"

 #include "vp9/common/vp9_onyxc_int.h"

--- a/vp9/common/vp9_blockd.h

+++ b/vp9/common/vp9_blockd.h

@@ -16,9 +16,9 @@

 #include "./vpx_config.h"

 #include "vpx_scale/yv12config.h"

+#include "vp9/common/vp9_convolve.h"

 #include "vp9/common/vp9_mv.h"

 #include "vp9/common/vp9_treecoder.h"

-#include "vp9/common/vp9_subpixel.h"

 #include "vpx_ports/mem.h"

 #include "vp9/common/vp9_common.h"

@@ -394,15 +394,8 @@

   void (*inv_walsh4x4_1)(int16_t *in, int16_t *out);

   void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out);

+  struct subpix_fn_table  subpix;

-  vp9_subpix_fn_t  subpixel_predict4x4;

-  vp9_subpix_fn_t  subpixel_predict8x4;

-  vp9_subpix_fn_t  subpixel_predict8x8;

-  vp9_subpix_fn_t  subpixel_predict16x16;

-  vp9_subpix_fn_t  subpixel_predict_avg4x4;

-  vp9_subpix_fn_t  subpixel_predict_avg8x4;

-  vp9_subpix_fn_t  subpixel_predict_avg8x8;

-  vp9_subpix_fn_t  subpixel_predict_avg16x16;

   int allow_high_precision_mv;

   int corrupted;

--- /dev/null

+++ b/vp9/common/vp9_convolve.c

@@ -1,0 +1,345 @@

+/*

+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include <assert.h>

+#include "./vpx_config.h"

+#include "./vp9_rtcd.h"

+#include "vp9/common/vp9_common.h"

+#include "vpx/vpx_integer.h"

+#define VP9_FILTER_WEIGHT 128

+#define VP9_FILTER_SHIFT  7

+#define ALIGN_FILTERS_256 0

+/* Assume a bank of 16 filters to choose from. There are three possible

+ * implementations of filter wrapping behavior, since we want to be able to

+ * pick which filter to start with. We could either:

+ *

+ * 1) make filter_ a pointer to the base of the filter array, and then add an

+ *    additional offset parameter, to choose the starting filter.

+ * 2) use a pointer to 2 periods worth of filters, so that even if the original

+ *    phase offset is at 15/16, we'll have valid data to read. The filter

+ *    tables become [32][8], and the second half is duplicated.

+ * 3) fix the alignment of the filter tables, so that we know the 0/16 is

+ *    always 256 byte aligned.

+ *

+ * Implementations 2 and 3 are likely preferable, as they avoid an extra 2

+ * parameters, and switching between them is trivial.

+ */

+/* Horizontal convolution of a w x h block with a taps-tap filter.

+ *

+ * filter_x0 is the kernel used for the first pixel of every row. After each

+ * output pixel the position advances by x_step_q4, in 1/16 pixel units: the

+ * integer part (>> 4) selects the source pixel and the fractional part

+ * (& 0xf) selects one of 16 kernels counted from filter_x_base.

+ * filter_y and y_step_q4 are not used by this function.

+ */

+static void convolve_horiz_c(const uint8_t *src, int src_stride,

+                             uint8_t *dst, int dst_stride,

+                             const int16_t *filter_x0, int x_step_q4,

+                             const int16_t *filter_y, int y_step_q4,

+                             int w, int h, int taps) {

+  int x, y, k, sum;

+  /* Kernel bank base; equals filter_x0 unless ALIGN_FILTERS_256 is set */

+  const int16_t *filter_x_base = filter_x0;

+#if ALIGN_FILTERS_256

+  /* Round the address down to a 256 byte boundary */

+  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);

+#endif

+  /* Back up so that tap (taps / 2 - 1) lines up with the output pixel */

+  src -= taps / 2 - 1;

+  for (y = 0; y < h; ++y) {

+    /* Every row starts again from the caller's kernel */

+    const int16_t *filter_x = filter_x0;

+    /* Initial phase offset: index of the starting kernel relative to base */

+    int x_q4 = (filter_x - filter_x_base) / taps;

+    for (x = 0; x < w; ++x) {

+      /* Per-pixel src offset */

+      int src_x = x_q4 >> 4;

+      for (sum = 0, k = 0; k < taps; ++k) {

+        sum += src[src_x + k] * filter_x[k];

+      }

+      sum += (VP9_FILTER_WEIGHT >> 1);  /* half of the weight, for rounding */

+      dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT);

+      /* Adjust source and filter to use for the next pixel */

+      x_q4 += x_step_q4;

+      filter_x = filter_x_base + (x_q4 & 0xf) * taps;

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+/* Horizontal convolution of a w x h block with a taps-tap filter, where each

+ * filtered pixel is averaged (with rounding) into the value already in dst

+ * instead of overwriting it.

+ *

+ * filter_x0 is the kernel used for the first pixel of every row; x_step_q4

+ * advances the position in 1/16 pixel units after each output pixel.

+ * filter_y and y_step_q4 are not used by this function.

+ */

+static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,

+                                 uint8_t *dst, int dst_stride,

+                                 const int16_t *filter_x0, int x_step_q4,

+                                 const int16_t *filter_y, int y_step_q4,

+                                 int w, int h, int taps) {

+  int x, y, k, sum;

+  /* Kernel bank base; equals filter_x0 unless ALIGN_FILTERS_256 is set */

+  const int16_t *filter_x_base = filter_x0;

+#if ALIGN_FILTERS_256

+  /* Round the address down to a 256 byte boundary */

+  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);

+#endif

+  /* Back up so that tap (taps / 2 - 1) lines up with the output pixel */

+  src -= taps / 2 - 1;

+  for (y = 0; y < h; ++y) {

+    /* Every row starts again from the caller's kernel */

+    const int16_t *filter_x = filter_x0;

+    /* Initial phase offset: index of the starting kernel relative to base */

+    int x_q4 = (filter_x - filter_x_base) / taps;

+    for (x = 0; x < w; ++x) {

+      /* Per-pixel src offset */

+      int src_x = x_q4 >> 4;

+      for (sum = 0, k = 0; k < taps; ++k) {

+        sum += src[src_x + k] * filter_x[k];

+      }

+      sum += (VP9_FILTER_WEIGHT >> 1);  /* half of the weight, for rounding */

+      /* Rounded average of the existing dst pixel and the filtered pixel */

+      dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;

+      /* Adjust source and filter to use for the next pixel */

+      x_q4 += x_step_q4;

+      filter_x = filter_x_base + (x_q4 & 0xf) * taps;

+    }

+    src += src_stride;

+    dst += dst_stride;

+  }

+}

+/* Vertical convolution of a w x h block with a taps-tap filter. The block is

+ * processed one column at a time.

+ *

+ * filter_y0 is the kernel used for the first pixel of every column. After

+ * each output pixel the position advances by y_step_q4, in 1/16 pixel units:

+ * the integer part (>> 4) selects the source row and the fractional part

+ * (& 0xf) selects one of 16 kernels counted from filter_y_base.

+ * filter_x and x_step_q4 are not used by this function.

+ */

+static void convolve_vert_c(const uint8_t *src, int src_stride,

+                            uint8_t *dst, int dst_stride,

+                            const int16_t *filter_x, int x_step_q4,

+                            const int16_t *filter_y0, int y_step_q4,

+                            int w, int h, int taps) {

+  int x, y, k, sum;

+  /* Kernel bank base; equals filter_y0 unless ALIGN_FILTERS_256 is set */

+  const int16_t *filter_y_base = filter_y0;

+#if ALIGN_FILTERS_256

+  /* Round the address down to a 256 byte boundary */

+  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);

+#endif

+  /* Back up (taps / 2 - 1) rows so that tap lines up with the output pixel */

+  src -= src_stride * (taps / 2 - 1);

+  for (x = 0; x < w; ++x) {

+    /* Every column starts again from the caller's kernel */

+    const int16_t *filter_y = filter_y0;

+    /* Initial phase offset: index of the starting kernel relative to base */

+    int y_q4 = (filter_y - filter_y_base) / taps;

+    for (y = 0; y < h; ++y) {

+      /* Per-pixel src offset */

+      int src_y = y_q4 >> 4;

+      for (sum = 0, k = 0; k < taps; ++k) {

+        sum += src[(src_y + k) * src_stride] * filter_y[k];

+      }

+      sum += (VP9_FILTER_WEIGHT >> 1);  /* half of the weight, for rounding */

+      dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT);

+      /* Adjust source and filter to use for the next pixel */

+      y_q4 += y_step_q4;

+      filter_y = filter_y_base + (y_q4 & 0xf) * taps;

+    }

+    /* Move on to the next column */

+    ++src;

+    ++dst;

+  }

+}

+/* Vertical convolution of a w x h block with a taps-tap filter, where each

+ * filtered pixel is averaged (with rounding) into the value already in dst

+ * instead of overwriting it. The block is processed one column at a time.

+ *

+ * filter_y0 is the kernel used for the first pixel of every column;

+ * y_step_q4 advances the position in 1/16 pixel units after each output

+ * pixel. filter_x and x_step_q4 are not used by this function.

+ */

+static void convolve_avg_vert_c(const uint8_t *src, int src_stride,

+                                uint8_t *dst, int dst_stride,

+                                const int16_t *filter_x, int x_step_q4,

+                                const int16_t *filter_y0, int y_step_q4,

+                                int w, int h, int taps) {

+  int x, y, k, sum;

+  /* Kernel bank base; equals filter_y0 unless ALIGN_FILTERS_256 is set */

+  const int16_t *filter_y_base = filter_y0;

+#if ALIGN_FILTERS_256

+  /* Round the address down to a 256 byte boundary */

+  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);

+#endif

+  /* Back up (taps / 2 - 1) rows so that tap lines up with the output pixel */

+  src -= src_stride * (taps / 2 - 1);

+  for (x = 0; x < w; ++x) {

+    /* Every column starts again from the caller's kernel */

+    const int16_t *filter_y = filter_y0;

+    /* Initial phase offset: index of the starting kernel relative to base */

+    int y_q4 = (filter_y - filter_y_base) / taps;

+    for (y = 0; y < h; ++y) {

+      /* Per-pixel src offset */

+      int src_y = y_q4 >> 4;

+      for (sum = 0, k = 0; k < taps; ++k) {

+        sum += src[(src_y + k) * src_stride] * filter_y[k];

+      }

+      sum += (VP9_FILTER_WEIGHT >> 1);  /* half of the weight, for rounding */

+      /* Rounded average of the existing dst pixel and the filtered pixel */

+      dst[y * dst_stride] =

+          (dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;

+      /* Adjust source and filter to use for the next pixel */

+      y_q4 += y_step_q4;

+      filter_y = filter_y_base + (y_q4 & 0xf) * taps;

+    }

+    /* Move on to the next column */

+    ++src;

+    ++dst;

+  }

+}

+/* Two-pass 2-D convolution: convolve_horiz_c filters the source into a

+ * stack buffer with a row stride of 16, then convolve_vert_c filters that

+ * buffer into dst. Requires w <= 16, h <= 16 and taps <= 8 (asserted).

+ */

+static void convolve_c(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int x_step_q4,

+                       const int16_t *filter_y, int y_step_q4,

+                       int w, int h, int taps) {

+  /* Fixed size intermediate buffer places limits on parameters. */

+  uint8_t temp[16 * 23];  /* 16 columns by up to 16 + 8 - 1 = 23 rows */

+  assert(w <= 16);

+  assert(h <= 16);

+  assert(taps <= 8);

+  /* Start (taps / 2 - 1) rows above the block and produce h + taps - 1 rows,

+   * the rows the vertical pass reads.

+   */

+  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,

+                   temp, 16,

+                   filter_x, x_step_q4, filter_y, y_step_q4,

+                   w, h + taps - 1, taps);

+  /* convolve_vert_c backs its source up by (taps / 2 - 1) rows itself, so

+   * hand it the row of temp that corresponds to the first block row.

+   */

+  convolve_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,

+                  filter_x, x_step_q4, filter_y, y_step_q4,

+                  w, h, taps);

+}

+/* Two-pass 2-D convolution whose result is averaged into dst:

+ * convolve_horiz_c filters the source into a stack buffer with a row stride

+ * of 16, then convolve_avg_vert_c filters that buffer and averages it with

+ * the existing dst pixels. Requires w <= 16, h <= 16 and taps <= 8

+ * (asserted).

+ */

+static void convolve_avg_c(const uint8_t *src, int src_stride,

+                           uint8_t *dst, int dst_stride,

+                           const int16_t *filter_x, int x_step_q4,

+                           const int16_t *filter_y, int y_step_q4,

+                           int w, int h, int taps) {

+  /* Fixed size intermediate buffer places limits on parameters. */

+  uint8_t temp[16 * 23];  /* 16 columns by up to 16 + 8 - 1 = 23 rows */

+  assert(w <= 16);

+  assert(h <= 16);

+  assert(taps <= 8);

+  /* Start (taps / 2 - 1) rows above the block and produce h + taps - 1 rows,

+   * the rows the vertical pass reads.

+   */

+  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,

+                   temp, 16,

+                   filter_x, x_step_q4, filter_y, y_step_q4,

+                   w, h + taps - 1, taps);

+  /* convolve_avg_vert_c backs its source up by (taps / 2 - 1) rows itself,

+   * so hand it the row of temp that corresponds to the first block row.

+   */

+  convolve_avg_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,

+                      filter_x, x_step_q4, filter_y, y_step_q4,

+                      w, h, taps);

+}

+void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,

+                           uint8_t *dst, int dst_stride,

+                           const int16_t *filter_x, int x_step_q4,

+                           const int16_t *filter_y, int y_step_q4,

+                           int w, int h) {

+  convolve_horiz_c(src, src_stride, dst, dst_stride,

+                   filter_x, x_step_q4, filter_y, y_step_q4,

+                   w, h, 8);

+}

+void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,

+                               uint8_t *dst, int dst_stride,

+                               const int16_t *filter_x, int x_step_q4,

+                               const int16_t *filter_y, int y_step_q4,

+                               int w, int h) {

+  convolve_avg_horiz_c(src, src_stride, dst, dst_stride,

+                       filter_x, x_step_q4, filter_y, y_step_q4,

+                       w, h, 8);

+}

+void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,

+                          uint8_t *dst, int dst_stride,

+                          const int16_t *filter_x, int x_step_q4,

+                          const int16_t *filter_y, int y_step_q4,

+                          int w, int h) {

+  convolve_vert_c(src, src_stride, dst, dst_stride,

+                  filter_x, x_step_q4, filter_y, y_step_q4,

+                  w, h, 8);

+}

+void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,

+                              uint8_t *dst, int dst_stride,

+                              const int16_t *filter_x, int x_step_q4,

+                              const int16_t *filter_y, int y_step_q4,

+                              int w, int h) {

+  convolve_avg_vert_c(src, src_stride, dst, dst_stride,

+                      filter_x, x_step_q4, filter_y, y_step_q4,

+                      w, h, 8);

+}

+void vp9_convolve8_c(const uint8_t *src, int src_stride,

+                     uint8_t *dst, int dst_stride,

+                     const int16_t *filter_x, int x_step_q4,

+                     const int16_t *filter_y, int y_step_q4,

+                     int w, int h) {

+  convolve_c(src, src_stride, dst, dst_stride,

+             filter_x, x_step_q4, filter_y, y_step_q4,

+             w, h, 8);

+}

+void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,

+                         uint8_t *dst, int dst_stride,

+                         const int16_t *filter_x, int x_step_q4,

+                         const int16_t *filter_y, int y_step_q4,

+                         int w, int h) {

+  convolve_avg_c(src, src_stride, dst, dst_stride,

+                 filter_x, x_step_q4, filter_y, y_step_q4,

+                 w, h, 8);

+}

+/* Block copy with the convolution prototype: copies the w x h block at src

+ * to dst. filter_x, filter_x_stride, filter_y and filter_y_stride are

+ * accepted only to match the prototype and are not used.

+ *

+ * 16x16, 8x8 and 8x4 blocks go through the vp9_copy_mem* helpers; every

+ * other size, including 4x4, is copied byte by byte.

+ */

+void vp9_convolve_copy(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int filter_x_stride,

+                       const int16_t *filter_y, int filter_y_stride,

+                       int w, int h) {

+  /* Match both dimensions so that a helper is never used for a block of a

+   * different size than the one it copies.

+   */

+  if (w == 16 && h == 16) {

+    vp9_copy_mem16x16(src, src_stride, dst, dst_stride);

+  } else if (w == 8 && h == 8) {

+    vp9_copy_mem8x8(src, src_stride, dst, dst_stride);

+  } else if (w == 8 && h == 4) {

+    vp9_copy_mem8x4(src, src_stride, dst, dst_stride);

+  } else {

+    /* Generic path. Bytes are copied individually so that no alignment or

+     * aliasing assumptions are made about src and dst.

+     */

+    int r, c;

+    for (r = 0; r < h; ++r) {

+      for (c = 0; c < w; ++c) {

+        dst[c] = src[c];

+      }

+      src += src_stride;

+      dst += dst_stride;

+    }

+  }

+}

+/* Block average with the convolution prototype: every pixel of the w x h

+ * block at dst becomes the rounded average of itself and the pixel at the

+ * same position in src. The filter arguments are not used.

+ */

+void vp9_convolve_avg(const uint8_t *src, int src_stride,

+                      uint8_t *dst, int dst_stride,

+                      const int16_t *filter_x, int filter_x_stride,

+                      const int16_t *filter_y, int filter_y_stride,

+                      int w, int h) {

+  int row;

+  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {

+    int col;

+    for (col = 0; col < w; ++col) {

+      /* The + 1 makes the halving round to nearest */

+      const int rounded_sum = dst[col] + src[col] + 1;

+      dst[col] = (uint8_t)(rounded_sum >> 1);

+    }

+  }

+}

--- /dev/null

+++ b/vp9/common/vp9_convolve.h

@@ -1,0 +1,43 @@

+/*

+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#ifndef VP9_COMMON_CONVOLVE_H_

+#define VP9_COMMON_CONVOLVE_H_

+#include "vpx/vpx_integer.h"

+// Common signature shared by the prediction functions declared below.

+// src/dst are pixel buffers whose rows are src_stride/dst_stride apart, and

+// w/h give the block size. filter_x/filter_y point at filter taps;

+// x_step_q4/y_step_q4 are presumably per-pixel source steps in 1/16-pel

+// units (TODO confirm against the convolve implementation).

+typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,

+                              uint8_t *dst, int dst_stride,

+                              const int16_t *filter_x, int x_step_q4,

+                              const int16_t *filter_y, int y_step_q4,

+                              int w, int h);

+// Not a convolution, a block copy conforming to the convolution prototype.

+// Copies pixels from src to dst; the filter and step arguments are not used

+// by the implementation.

+void vp9_convolve_copy(const uint8_t *src, int src_stride,

+                       uint8_t *dst, int dst_stride,

+                       const int16_t *filter_x, int x_step_q4,

+                       const int16_t *filter_y, int y_step_q4,

+                       int w, int h);

+// Not a convolution, a block average conforming to the convolution prototype.

+// Each pixel of the w x h destination block becomes (dst + src + 1) >> 1;

+// the filter and step arguments are not used by the implementation.

+void vp9_convolve_avg(const uint8_t *src, int src_stride,

+                      uint8_t *dst, int dst_stride,

+                      const int16_t *filter_x, int x_step_q4,

+                      const int16_t *filter_y, int y_step_q4,

+                      int w, int h);

+// Bundles a set of prediction functions with the filter tables and step

+// values that go with them.

+struct subpix_fn_table {

+  // Indexed as [horiz][vert][avg]; presumably each index is 0 or 1 depending

+  // on whether that stage is needed -- confirm where the table is filled in.

+  convolve_fn_t predict[2][2][2];  // horiz, vert, avg

+  const int16_t (*filter_x)[8];  // pointer to rows of 8 filter taps

+  const int16_t (*filter_y)[8];  // pointer to rows of 8 filter taps

+  int x_step_q4;  // presumably horizontal step in 1/16-pel units (confirm)

+  int y_step_q4;  // presumably vertical step in 1/16-pel units (confirm)

+};

+#endif  // VP9_COMMON_CONVOLVE_H_

--- a/vp9/common/vp9_filter.c

+++ b/vp9/common/vp9_filter.c

@@ -15,23 +15,23 @@

 #include "vp9_rtcd.h"

 #include "vp9/common/vp9_common.h"

-DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {

-  { 128,   0 },

-  { 120,   8 },

-  { 112,  16 },

-  { 104,  24 },

-  {  96,  32 },

-  {  88,  40 },

-  {  80,  48 },

-  {  72,  56 },

-  {  64,  64 },

-  {  56,  72 },

-  {  48,  80 },

-  {  40,  88 },

-  {  32,  96 },

-  {  24, 104 },

-  {  16, 112 },

-  {   8, 120 }

+// Bilinear (2-tap) weights stored in an 8-tap layout: the two weights sit at

+// indices 3 and 4 and every other tap is zero. Row k holds {128 - 8k, 8k},

+// so each row sums to 128. The padding presumably lets these rows be passed

+// to the same 8-tap code as the other filter tables (confirm at call sites).

+DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {

+  { 0, 0, 0, 128,   0, 0, 0, 0 },

+  { 0, 0, 0, 120,   8, 0, 0, 0 },

+  { 0, 0, 0, 112,  16, 0, 0, 0 },

+  { 0, 0, 0, 104,  24, 0, 0, 0 },

+  { 0, 0, 0,  96,  32, 0, 0, 0 },

+  { 0, 0, 0,  88,  40, 0, 0, 0 },

+  { 0, 0, 0,  80,  48, 0, 0, 0 },

+  { 0, 0, 0,  72,  56, 0, 0, 0 },

+  { 0, 0, 0,  64,  64, 0, 0, 0 },

+  { 0, 0, 0,  56,  72, 0, 0, 0 },

+  { 0, 0, 0,  48,  80, 0, 0, 0 },

+  { 0, 0, 0,  40,  88, 0, 0, 0 },

+  { 0, 0, 0,  32,  96, 0, 0, 0 },

+  { 0, 0, 0,  24, 104, 0, 0, 0 },

+  { 0, 0, 0,  16, 112, 0, 0, 0 },

+  { 0, 0, 0,   8, 120, 0, 0, 0 }

};

 #define FILTER_ALPHA       0

@@ -144,1072 +144,21 @@

   { 1, -2, -7, 37, 80, 28, -8, -1}

};

-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {

-  {0,   0, 128,   0,   0, 0},

-  {1,  -5, 125,   8,  -2, 1},

-  {1,  -8, 122,  17,  -5, 1},

-  {2, -11, 116,  27,  -8, 2},

-  {3, -14, 110,  37, -10, 2},

-  {3, -15, 103,  47, -12, 2},

-  {3, -16,  95,  57, -14, 3},

-  {3, -16,  86,  67, -15, 3},

-  {3, -16,  77,  77, -16, 3},

-  {3, -15,  67,  86, -16, 3},

-  {3, -14,  57,  95, -16, 3},

-  {2, -12,  47, 103, -15, 3},

-  {2, -10,  37, 110, -14, 3},

-  {2,  -8,  27, 116, -11, 2},

-  {1,  -5,  17, 122,  -8, 1},

-  {1,  -2,   8, 125,  -5, 1}

+// 6-tap sub-pixel filters zero-padded to 8 taps: the six weights occupy

+// indices 1..6 and indices 0 and 7 are always zero. Each row sums to 128,

+// and row 0 is the identity (all weight on index 3).

+DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]) = {

+  {0, 0,   0, 128,   0,   0, 0,  0},

+  {0, 1,  -5, 125,   8,  -2, 1,  0},

+  {0, 1,  -8, 122,  17,  -5, 1,  0},

+  {0, 2, -11, 116,  27,  -8, 2,  0},

+  {0, 3, -14, 110,  37, -10, 2,  0},

+  {0, 3, -15, 103,  47, -12, 2,  0},

+  {0, 3, -16,  95,  57, -14, 3,  0},

+  {0, 3, -16,  86,  67, -15, 3,  0},

+  {0, 3, -16,  77,  77, -16, 3,  0},

+  {0, 3, -15,  67,  86, -16, 3,  0},

+  {0, 3, -14,  57,  95, -16, 3,  0},

+  {0, 2, -12,  47, 103, -15, 3,  0},

+  {0, 2, -10,  37, 110, -14, 3,  0},

+  {0, 2,  -8,  27, 116, -11, 2,  0},

+  {0, 1,  -5,  17, 122,  -8, 1,  0},

+  {0, 1,  -2,   8, 125,  -5, 1,  0}

};

-static void filter_block2d_first_pass_6(uint8_t *src_ptr,

-                                        int *output_ptr,

-                                        unsigned int src_pixels_per_line,

-                                        unsigned int pixel_step,

-                                        unsigned int output_height,

-                                        unsigned int output_width,

-                                        const int16_t *vp9_filter) {

-  unsigned int i, j;

-  int temp;

-  for (i = 0; i < output_height; i++) {

-    for (j = 0; j < output_width; j++) {

-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +

-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +

-             ((int)src_ptr[0]                    * vp9_filter[2]) +

-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +

-             ((int)src_ptr[2 * pixel_step]       * vp9_filter[4]) +

-             ((int)src_ptr[3 * pixel_step]       * vp9_filter[5]) +

-             (VP9_FILTER_WEIGHT >> 1);      /* Rounding */

-      /* Normalize back to 0-255 */

-      output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);

-      src_ptr++;

-    }

-    /* Next row... */

-    src_ptr    += src_pixels_per_line - output_width;

-    output_ptr += output_width;

-  }

-}

-static void filter_block2d_second_pass_6(int *src_ptr,

-                                         uint8_t *output_ptr,

-                                         int output_pitch,

-                                         unsigned int src_pixels_per_line,

-                                         unsigned int pixel_step,

-                                         unsigned int output_height,

-                                         unsigned int output_width,

-                                         const int16_t *vp9_filter) {

-  unsigned int i, j;

-  int temp;

-  for (i = 0; i < output_height; i++) {

-    for (j = 0; j < output_width; j++) {

-      /* Apply filter */

-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +

-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +

-             ((int)src_ptr[0]                    * vp9_filter[2]) +

-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +

-             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +

-             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +

-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */

-      /* Normalize back to 0-255 */

-      output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);

-      src_ptr++;

-    }

-    /* Start next row */

-    src_ptr    += src_pixels_per_line - output_width;

-    output_ptr += output_pitch;

-  }

-}

-/*

- * The only functional difference between filter_block2d_second_pass()

- * and this function is that filter_block2d_second_pass() does a sixtap

- * filter on the input and stores it in the output. This function

- * (filter_block2d_second_pass_avg()) does a sixtap filter on the input,

- * and then averages that with the content already present in the output

- * ((filter_result + dest + 1) >> 1) and stores that in the output.

- */

-static void filter_block2d_second_pass_avg_6(int *src_ptr,

-                                             uint8_t *output_ptr,

-                                             int output_pitch,

-                                             unsigned int src_pixels_per_line,

-                                             unsigned int pixel_step,

-                                             unsigned int output_height,

-                                             unsigned int output_width,

-                                             const int16_t *vp9_filter) {

-  unsigned int i, j;

-  int temp;

-  for (i = 0; i < output_height; i++) {

-    for (j = 0; j < output_width; j++) {

-      /* Apply filter */

-      temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +

-             ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +

-             ((int)src_ptr[0]                    * vp9_filter[2]) +

-             ((int)src_ptr[pixel_step]           * vp9_filter[3]) +

-             ((int)src_ptr[2 * pixel_step]         * vp9_filter[4]) +

-             ((int)src_ptr[3 * pixel_step]         * vp9_filter[5]) +

-             (VP9_FILTER_WEIGHT >> 1);   /* Rounding */

-      /* Normalize back to 0-255 */

-      output_ptr[j] = (clip_pixel(temp >> VP9_FILTER_SHIFT) +

-                       output_ptr[j] + 1) >> 1;

-      src_ptr++;

-    }

-    /* Start next row */

-    src_ptr    += src_pixels_per_line - output_width;

-    output_ptr += output_pitch;

-  }

-}

-#define Interp_Extend 3

-static void filter_block2d_6(uint8_t *src_ptr,

-                             uint8_t *output_ptr,

-                             unsigned int src_pixels_per_line,

-                             int output_pitch,

-                             const int16_t *HFilter,

-                             const int16_t *VFilter) {

-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr,

-                               output_pitch, 4, 4, 4, 4, VFilter);

-}

-void vp9_sixtap_predict4x4_c(uint8_t *src_ptr,

-                             int src_pixels_per_line,

-                             int xoffset,

-                             int yoffset,

-                             uint8_t *dst_ptr,

-                             int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,

-                   VFilter);

-}

-/*

- * The difference between filter_block2d_6() and filter_block2d_avg_6 is

- * that filter_block2d_6() does a 6-tap filter and stores it in the output

- * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and

- * then averages that with the content already present in the output

- * ((filter_result + dest + 1) >> 1) and stores that in the output.

- */

-static void filter_block2d_avg_6(uint8_t *src_ptr,

-                                 uint8_t *output_ptr,

-                                 unsigned int src_pixels_per_line,

-                                 int output_pitch,

-                                 const int16_t *HFilter,

-                                 const int16_t *VFilter) {

-  int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,

-                                   output_pitch, 4, 4, 4, 4, VFilter);

-}

-void vp9_sixtap_predict_avg4x4_c(uint8_t *src_ptr,

-                                 int src_pixels_per_line,

-                                 int xoffset,

-                                 int yoffset,

-                                 uint8_t *dst_ptr,

-                                 int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,

-                       HFilter, VFilter);

-}

-void vp9_sixtap_predict8x8_c(uint8_t *src_ptr,

-                             int src_pixels_per_line,

-                             int xoffset,

-                             int yoffset,

-                             uint8_t *dst_ptr,

-                             int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,

-                               dst_pitch, 8, 8, 8, 8, VFilter);

-}

-void vp9_sixtap_predict_avg8x8_c(uint8_t *src_ptr,

-                                 int src_pixels_per_line,

-                                 int xoffset,

-                                 int yoffset,

-                                 uint8_t *dst_ptr,

-                                 int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr,

-                                   dst_pitch, 8, 8, 8, 8, VFilter);

-}

-void vp9_sixtap_predict8x4_c(uint8_t *src_ptr,

-                             int src_pixels_per_line,

-                             int xoffset,

-                             int yoffset,

-                             uint8_t *dst_ptr,

-                             int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer */

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 3 + Interp_Extend * 2, 8, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,

-                               dst_pitch, 8, 8, 4, 8, VFilter);

-}

-void vp9_sixtap_predict16x16_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr,

-                               dst_pitch, 16, 16, 16, 16, VFilter);

-}

-void vp9_sixtap_predict_avg16x16_c(uint8_t *src_ptr,

-                                   int src_pixels_per_line,

-                                   int xoffset,

-                                   int yoffset,

-                                   uint8_t *dst_ptr,

-                                   int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */

-  HFilter = vp9_sub_pel_filters_6[xoffset];   /* 6 tap */

-  VFilter = vp9_sub_pel_filters_6[yoffset];   /* 6 tap */

-  /* First filter 1-D horizontally... */

-  filter_block2d_first_pass_6(

-      src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,

-      src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);

-  /* then filter vertically... */

-  filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr,

-                                   dst_pitch, 16, 16, 16, 16, VFilter);

-}

-typedef enum {

-  VPX_FILTER_4x4 = 0,

-  VPX_FILTER_8x8 = 1,

-  VPX_FILTER_8x4 = 2,

-  VPX_FILTER_16x16 = 3,

-} filter_size_t;

-static const unsigned int filter_size_to_wh[][2] = {

-  {4, 4},

-  {8, 8},

-  {8, 4},

-  {16,16},

-};

-static void filter_block2d_8_c(const uint8_t *src_ptr,

-                               const unsigned int src_stride,

-                               const int16_t *HFilter,

-                               const int16_t *VFilter,

-                               const filter_size_t filter_size,

-                               uint8_t *dst_ptr,

-                               unsigned int dst_stride) {

-  const unsigned int output_width = filter_size_to_wh[filter_size][0];

-  const unsigned int output_height = filter_size_to_wh[filter_size][1];

-  // Between passes, we use an intermediate buffer whose height is extended to

-  // have enough horizontally filtered values as input for the vertical pass.

-  // This buffer is allocated to be big enough for the largest block type we

-  // support.

-  const int kInterp_Extend = 4;

-  const unsigned int intermediate_height =

-    (kInterp_Extend - 1) +     output_height + kInterp_Extend;

-  /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,

-   * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height

-   *                                 + kInterp_Extend

-   *                               = 3 + 16 + 4

-   *                               = 23

-   * and filter_max_width = 16

-   */

-  uint8_t intermediate_buffer[23 * 16];

-  const int intermediate_next_stride = 1 - intermediate_height * output_width;

-  // Horizontal pass (src -> transposed intermediate).

-  {

-    uint8_t *output_ptr = intermediate_buffer;

-    const int src_next_row_stride = src_stride - output_width;

-    unsigned int i, j;

-    src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);

-    for (i = 0; i < intermediate_height; i++) {

-      for (j = 0; j < output_width; j++) {

-        // Apply filter...

-        int temp = ((int)src_ptr[0] * HFilter[0]) +

-                   ((int)src_ptr[1] * HFilter[1]) +

-                   ((int)src_ptr[2] * HFilter[2]) +

-                   ((int)src_ptr[3] * HFilter[3]) +

-                   ((int)src_ptr[4] * HFilter[4]) +

-                   ((int)src_ptr[5] * HFilter[5]) +

-                   ((int)src_ptr[6] * HFilter[6]) +

-                   ((int)src_ptr[7] * HFilter[7]) +

-                   (VP9_FILTER_WEIGHT >> 1); // Rounding

-        // Normalize back to 0-255...

-        *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);

-        src_ptr++;

-        output_ptr += intermediate_height;

-      }

-      src_ptr += src_next_row_stride;

-      output_ptr += intermediate_next_stride;

-    }

-  }

-  // Vertical pass (transposed intermediate -> dst).

-  {

-    uint8_t *src_ptr = intermediate_buffer;

-    const int dst_next_row_stride = dst_stride - output_width;

-    unsigned int i, j;

-    for (i = 0; i < output_height; i++) {

-      for (j = 0; j < output_width; j++) {

-        // Apply filter...

-        int temp = ((int)src_ptr[0] * VFilter[0]) +

-                   ((int)src_ptr[1] * VFilter[1]) +

-                   ((int)src_ptr[2] * VFilter[2]) +

-                   ((int)src_ptr[3] * VFilter[3]) +

-                   ((int)src_ptr[4] * VFilter[4]) +

-                   ((int)src_ptr[5] * VFilter[5]) +

-                   ((int)src_ptr[6] * VFilter[6]) +

-                   ((int)src_ptr[7] * VFilter[7]) +

-                   (VP9_FILTER_WEIGHT >> 1); // Rounding

-        // Normalize back to 0-255...

-        *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);

-        src_ptr += intermediate_height;

-      }

-      src_ptr += intermediate_next_stride;

-      dst_ptr += dst_next_row_stride;

-    }

-  }

-}

-void vp9_filter_block2d_4x4_8_c(const uint8_t *src_ptr,

-                                const unsigned int src_stride,

-                                const int16_t *HFilter_aligned16,

-                                const int16_t *VFilter_aligned16,

-                                uint8_t *dst_ptr,

-                                unsigned int dst_stride) {

-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,

-                     VPX_FILTER_4x4, dst_ptr, dst_stride);

-}

-void vp9_filter_block2d_8x4_8_c(const uint8_t *src_ptr,

-                                const unsigned int src_stride,

-                                const int16_t *HFilter_aligned16,

-                                const int16_t *VFilter_aligned16,

-                                uint8_t *dst_ptr,

-                                unsigned int dst_stride) {

-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,

-                     VPX_FILTER_8x4, dst_ptr, dst_stride);

-}

-void vp9_filter_block2d_8x8_8_c(const uint8_t *src_ptr,

-                                const unsigned int src_stride,

-                                const int16_t *HFilter_aligned16,

-                                const int16_t *VFilter_aligned16,

-                                uint8_t *dst_ptr,

-                                unsigned int dst_stride) {

-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,

-                     VPX_FILTER_8x8, dst_ptr, dst_stride);

-}

-void vp9_filter_block2d_16x16_8_c(const uint8_t *src_ptr,

-                                  const unsigned int src_stride,

-                                  const int16_t *HFilter_aligned16,

-                                  const int16_t *VFilter_aligned16,

-                                  uint8_t *dst_ptr,

-                                  unsigned int dst_stride) {

-  filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,

-                     VPX_FILTER_16x16, dst_ptr, dst_stride);

-}

-static void block2d_average_c(uint8_t *src,

-                              unsigned int src_stride,

-                              uint8_t *output_ptr,

-                              unsigned int output_stride,

-                              const filter_size_t filter_size) {

-  const unsigned int output_width = filter_size_to_wh[filter_size][0];

-  const unsigned int output_height = filter_size_to_wh[filter_size][1];

-  unsigned int i, j;

-  for (i = 0; i < output_height; i++) {

-    for (j = 0; j < output_width; j++) {

-      output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;

-    }

-    output_ptr += output_stride;

-  }

-}

-#define block2d_average block2d_average_c

-void vp9_eighttap_predict4x4_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_sub_pel_filters_8[xoffset];

-  VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict_avg4x4_c(uint8_t *src_ptr,

-                                   int src_pixels_per_line,

-                                   int xoffset,

-                                   int yoffset,

-                                   uint8_t *dst_ptr,

-                                   int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  uint8_t tmp[4 * 4];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           4);

-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);

-}

-void vp9_eighttap_predict4x4_sharp_c(uint8_t *src_ptr,

-                                     int src_pixels_per_line,

-                                     int xoffset,

-                                     int yoffset,

-                                     uint8_t *dst_ptr,

-                                     int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_sub_pel_filters_8s[xoffset];

-  VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict4x4_smooth_c(uint8_t *src_ptr,

-                                      int src_pixels_per_line,

-                                      int xoffset,

-                                      int yoffset,

-                                      uint8_t *dst_ptr,

-                                      int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,

-                           HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict_avg4x4_sharp_c(uint8_t *src_ptr,

-                                         int src_pixels_per_line,

-                                         int xoffset,

-                                         int yoffset,

-                                         uint8_t *dst_ptr,

-                                         int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  uint8_t tmp[4 * 4];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           4);

-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);

-}

-void vp9_eighttap_predict_avg4x4_smooth_c(uint8_t *src_ptr,

-                                          int src_pixels_per_line,

-                                          int xoffset,

-                                          int yoffset,

-                                          uint8_t *dst_ptr,

-                                          int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  uint8_t tmp[4 * 4];

-  vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           4);

-  block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);

-}

-void vp9_eighttap_predict8x8_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict8x8_sharp_c(uint8_t *src_ptr,

-                                     int src_pixels_per_line,

-                                     int xoffset,

-                                     int yoffset,

-                                     uint8_t *dst_ptr,

-                                     int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict8x8_smooth_c(uint8_t *src_ptr,

-                                      int src_pixels_per_line,

-                                      int xoffset,

-                                      int yoffset,

-                                      uint8_t *dst_ptr,

-                                      int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict_avg8x8_c(uint8_t *src_ptr,

-                                   int src_pixels_per_line,

-                                   int xoffset,

-                                   int yoffset,

-                                   uint8_t *dst_ptr,

-                                   int dst_pitch) {

-  uint8_t tmp[8 * 8];

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           8);

-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);

-}

-void vp9_eighttap_predict_avg8x8_sharp_c(uint8_t *src_ptr,

-                                         int src_pixels_per_line,

-                                         int xoffset,

-                                         int yoffset,

-                                         uint8_t *dst_ptr,

-                                         int dst_pitch) {

-  uint8_t tmp[8 * 8];

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           8);

-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);

-}

-void vp9_eighttap_predict_avg8x8_smooth_c(uint8_t *src_ptr,

-                                          int src_pixels_per_line,

-                                          int xoffset,

-                                          int yoffset,

-                                          uint8_t *dst_ptr,

-                                          int dst_pitch) {

-  uint8_t tmp[8 * 8];

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,

-                           8);

-  block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);

-}

-void vp9_eighttap_predict8x4_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict8x4_sharp_c(uint8_t *src_ptr,

-                                     int src_pixels_per_line,

-                                     int xoffset,

-                                     int yoffset,

-                                     uint8_t *dst_ptr,

-                                     int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict8x4_smooth_c(uint8_t *src_ptr,

-                                      int src_pixels_per_line,

-                                      int xoffset,

-                                      int yoffset,

-                                      uint8_t *dst_ptr,

-                                      int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                           dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict16x16_c(uint8_t *src_ptr,

-                                 int src_pixels_per_line,

-                                 int xoffset,

-                                 int yoffset,

-                                 uint8_t *dst_ptr,

-                                 int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict16x16_sharp_c(uint8_t *src_ptr,

-                                       int src_pixels_per_line,

-                                       int xoffset,

-                                       int yoffset,

-                                       uint8_t *dst_ptr,

-                                       int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict16x16_smooth_c(uint8_t *src_ptr,

-                                        int src_pixels_per_line,

-                                        int xoffset,

-                                        int yoffset,

-                                        uint8_t *dst_ptr,

-                                        int dst_pitch) {

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             dst_ptr, dst_pitch);

-}

-void vp9_eighttap_predict_avg16x16_c(uint8_t *src_ptr,

-                                     int src_pixels_per_line,

-                                     int xoffset,

-                                     int yoffset,

-                                     uint8_t *dst_ptr,

-                                     int dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);

-  const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             tmp, 16);

-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);

-}

-void vp9_eighttap_predict_avg16x16_sharp_c(uint8_t *src_ptr,

-                                           int src_pixels_per_line,

-                                           int xoffset,

-                                           int yoffset,

-                                           uint8_t *dst_ptr,

-                                           int dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);

-  const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             tmp, 16);

-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);

-}

-void vp9_eighttap_predict_avg16x16_smooth_c(uint8_t *src_ptr,

-                                            int src_pixels_per_line,

-                                            int xoffset,

-                                            int yoffset,

-                                            uint8_t *dst_ptr,

-                                            int dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);

-  const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];

-  const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];

-  vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,

-                             tmp, 16);

-  block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);

-}

-/****************************************************************************

- *

- *  ROUTINE       : filter_block2d_bil_first_pass

- *

- *  INPUTS        : uint8_t  *src_ptr    : Pointer to source block.

- *                  uint32_t  src_stride : Stride of source block.

- *                  uint32_t  height     : Block height.

- *                  uint32_t  width      : Block width.

- *                  int32_t  *vp9_filter : Array of 2 bi-linear filter taps.

- *

- *  OUTPUTS       : int32_t  *dst_ptr    : Pointer to filtered block.

- *

- *  RETURNS       : void

- *

- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block

- *                  in the horizontal direction to produce the filtered output

- *                  block. Used to implement first-pass of 2-D separable filter.

- *

- *  SPECIAL NOTES : Produces int32_t output to retain precision for next pass.

- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.

- *

- ****************************************************************************/

-static void filter_block2d_bil_first_pass(uint8_t *src_ptr,

-                                          uint16_t *dst_ptr,

-                                          unsigned int src_stride,

-                                          unsigned int height,

-                                          unsigned int width,

-                                          const int16_t *vp9_filter) {

-  unsigned int i, j;

-  for (i = 0; i < height; i++) {

-    for (j = 0; j < width; j++) {

-      /* Apply bilinear filter */

-      dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +

-                    ((int)src_ptr[1] * vp9_filter[1]) +

-                    (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;

-      src_ptr++;

-    }

-    /* Next row... */

-    src_ptr += src_stride - width;

-    dst_ptr += width;

-  }

-}

-/****************************************************************************

- *

- *  ROUTINE       : filter_block2d_bil_second_pass

- *

- *  INPUTS        : int32_t  *src_ptr    : Pointer to source block.

- *                  uint32_t  dst_pitch  : Destination block pitch.

- *                  uint32_t  height     : Block height.

- *                  uint32_t  width      : Block width.

- *                  int32_t  *vp9_filter : Array of 2 bi-linear filter taps.

- *

- *  OUTPUTS       : uint16_t *dst_ptr    : Pointer to filtered block.

- *

- *  RETURNS       : void

- *

- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block

- *                  in the vertical direction to produce the filtered output

- *                  block. Used to implement second-pass of 2-D separable filter.

- *

- *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.

- *                  Two filter taps should sum to VP9_FILTER_WEIGHT.

- *

- ****************************************************************************/

-static void filter_block2d_bil_second_pass(uint16_t *src_ptr,

-                                           uint8_t *dst_ptr,

-                                           int dst_pitch,

-                                           unsigned int height,

-                                           unsigned int width,

-                                           const int16_t *vp9_filter) {

-  unsigned int i, j;

-  int temp;

-  for (i = 0; i < height; i++) {

-    for (j = 0; j < width; j++) {

-      /* Apply filter */

-      temp = ((int)src_ptr[0]     * vp9_filter[0]) +

-             ((int)src_ptr[width] * vp9_filter[1]) +

-             (VP9_FILTER_WEIGHT / 2);

-      dst_ptr[j] = (unsigned int)(temp >> VP9_FILTER_SHIFT);

-      src_ptr++;

-    }

-    /* Next row... */

-    dst_ptr += dst_pitch;

-  }

-}

-/*

- * As before for filter_block2d_second_pass_avg(), the functional difference

- * between filter_block2d_bil_second_pass() and filter_block2d_bil_second_pass_avg()

- * is that filter_block2d_bil_second_pass() does a bilinear filter on input

- * and stores the result in output; filter_block2d_bil_second_pass_avg(),

- * instead, does a bilinear filter on input, averages the resulting value

- * with the values already present in the output and stores the result of

- * that back into the output ((filter_result + dest + 1) >> 1).

- */

-static void filter_block2d_bil_second_pass_avg(uint16_t *src_ptr,

-                                               uint8_t *dst_ptr,

-                                               int dst_pitch,

-                                               unsigned int height,

-                                               unsigned int width,

-                                               const int16_t *vp9_filter) {

-  unsigned int i, j;

-  int temp;

-  for (i = 0; i < height; i++) {

-    for (j = 0; j < width; j++) {

-      /* Apply filter */

-      temp = (((int)src_ptr[0]     * vp9_filter[0]) +

-              ((int)src_ptr[width] * vp9_filter[1]) +

-              (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;

-      dst_ptr[j] = (unsigned int)((temp + dst_ptr[j] + 1) >> 1);

-      src_ptr++;

-    }

-    /* Next row... */

-    dst_ptr += dst_pitch;

-  }

-}

-/****************************************************************************

- *

- *  ROUTINE       : filter_block2d_bil

- *

- *  INPUTS        : uint8_t  *src_ptr          : Pointer to source block.

- *                  uint32_t  src_pitch        : Stride of source block.

- *                  uint32_t  dst_pitch        : Stride of destination block.

- *                  int32_t  *HFilter          : Array of 2 horizontal filter taps.

- *                  int32_t  *VFilter          : Array of 2 vertical filter taps.

- *                  int32_t  Width             : Block width

- *                  int32_t  Height            : Block height

- *

- *  OUTPUTS       : uint16_t *dst_ptr       : Pointer to filtered block.

- *

- *  RETURNS       : void

- *

- *  FUNCTION      : 2-D filters an input block by applying a 2-tap

- *                  bi-linear filter horizontally followed by a 2-tap

- *                  bi-linear filter vertically on the result.

- *

- *  SPECIAL NOTES : The largest block size can be handled here is 16x16

- *

- ****************************************************************************/

-static void filter_block2d_bil(uint8_t *src_ptr,

-                               uint8_t *dst_ptr,

-                               unsigned int src_pitch,

-                               unsigned int dst_pitch,

-                               const int16_t *HFilter,

-                               const int16_t *VFilter,

-                               int Width,

-                               int Height) {

-  uint16_t FData[17 * 16];  /* Temp data buffer used in filtering */

-  /* First filter 1-D horizontally... */

-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

-  /* then 1-D vertically... */

-  filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);

-}

-static void filter_block2d_bil_avg(uint8_t *src_ptr,

-                                   uint8_t *dst_ptr,

-                                   unsigned int src_pitch,

-                                   unsigned int dst_pitch,

-                                   const int16_t *HFilter,

-                                   const int16_t *VFilter,

-                                   int Width,

-                                   int Height) {

-  uint16_t FData[17 * 16];  /* Temp data buffer used in filtering */

-  /* First filter 1-D horizontally... */

-  filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);

-  /* then 1-D vertically... */

-  filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);

-}

-void vp9_bilinear_predict4x4_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);

-}

-void vp9_bilinear_predict_avg4x4_c(uint8_t *src_ptr,

-                                   int src_pixels_per_line,

-                                   int xoffset,

-                                   int yoffset,

-                                   uint8_t *dst_ptr,

-                                   int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,

-                         dst_pitch, HFilter, VFilter, 4, 4);

-}

-void vp9_bilinear_predict8x8_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);

-}

-void vp9_bilinear_predict_avg8x8_c(uint8_t *src_ptr,

-                                   int src_pixels_per_line,

-                                   int xoffset,

-                                   int yoffset,

-                                   uint8_t *dst_ptr,

-                                   int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,

-                         dst_pitch, HFilter, VFilter, 8, 8);

-}

-void vp9_bilinear_predict8x4_c(uint8_t *src_ptr,

-                               int src_pixels_per_line,

-                               int xoffset,

-                               int yoffset,

-                               uint8_t *dst_ptr,

-                               int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);

-}

-void vp9_bilinear_predict16x16_c(uint8_t *src_ptr,

-                                 int src_pixels_per_line,

-                                 int xoffset,

-                                 int yoffset,

-                                 uint8_t *dst_ptr,

-                                 int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);

-}

-void vp9_bilinear_predict_avg16x16_c(uint8_t *src_ptr,

-                                     int src_pixels_per_line,

-                                     int xoffset,

-                                     int yoffset,

-                                     uint8_t *dst_ptr,

-                                     int dst_pitch) {

-  const int16_t *HFilter;

-  const int16_t *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

-  filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,

-                         dst_pitch, HFilter, VFilter, 16, 16);

-}

--- a/vp9/common/vp9_filter.h

+++ b/vp9/common/vp9_filter.h

@@ -21,10 +21,17 @@

 #define SUBPEL_SHIFTS 16

-extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2];

-extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];

+extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8];

+extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8];

 extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];

 extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];

 extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];

+// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear

+// filter kernel as a 2 tap filter.

+#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \

+                   sizeof(vp9_bilinear_filters[0][0]))

+#define BF_OFFSET (BF_LENGTH / 2 - 1)

+#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET)

 #endif  // VP9_COMMON_VP9_FILTER_H_

--- a/vp9/common/vp9_findnearmv.c

+++ b/vp9/common/vp9_findnearmv.c

@@ -87,8 +87,8 @@

   uint8_t temp2[2 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3,

                                     src_pixels_per_line, 1, 3, 16, HFilter);

@@ -108,8 +108,8 @@

   uint8_t temp2[2 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3,

                                     src_pixels_per_line, 1, 17, 2, HFilter);

--- a/vp9/common/vp9_reconinter.c

+++ b/vp9/common/vp9_reconinter.c

@@ -8,10 +8,12 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include <assert.h>

 #include "./vpx_config.h"

 #include "vpx/vpx_integer.h"

 #include "vp9/common/vp9_blockd.h"

+#include "vp9/common/vp9_filter.h"

 #include "vp9/common/vp9_reconinter.h"

 #include "vp9/common/vp9_reconintra.h"

@@ -18,56 +20,46 @@

 void vp9_setup_interp_filters(MACROBLOCKD *xd,

                               INTERPOLATIONFILTERTYPE mcomp_filter_type,

                               VP9_COMMON *cm) {

+  // TODO(agrange): Investigate the best choice of functions to use here

+  // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what

+  // to do at full-pel offsets. The current selection, where the filter is

+  // applied in one direction only, and not at all for 0,0, seems to give the

+  // best quality, but it may be worth trying an additional mode that does

+  // do the filtering on full-pel.

+  xd->subpix.predict[0][0][0] = vp9_convolve_copy;

+  xd->subpix.predict[0][0][1] = vp9_convolve_avg;

+  xd->subpix.predict[0][1][0] = vp9_convolve8_vert;

+  xd->subpix.predict[0][1][1] = vp9_convolve8_avg_vert;

+  xd->subpix.predict[1][0][0] = vp9_convolve8_horiz;

+  xd->subpix.predict[1][0][1] = vp9_convolve8_avg_horiz;

+  xd->subpix.predict[1][1][0] = vp9_convolve8;

+  xd->subpix.predict[1][1][1] = vp9_convolve8_avg;

+  xd->subpix.x_step_q4 = 16;

+  xd->subpix.y_step_q4 = 16;

+  switch (mcomp_filter_type) {

+    case EIGHTTAP:

+    case SWITCHABLE:

+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8;

+      break;

+    case EIGHTTAP_SMOOTH:

+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp;

+      break;

+    case EIGHTTAP_SHARP:

+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s;

+      break;

+    case BILINEAR:

+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;

+      break;

 #if CONFIG_ENABLE_6TAP

-  if (mcomp_filter_type == SIXTAP) {

-    xd->subpixel_predict4x4     = vp9_sixtap_predict4x4;

-    xd->subpixel_predict8x4     = vp9_sixtap_predict8x4;

-    xd->subpixel_predict8x8     = vp9_sixtap_predict8x8;

-    xd->subpixel_predict16x16   = vp9_sixtap_predict16x16;

-    xd->subpixel_predict_avg4x4 = vp9_sixtap_predict_avg4x4;

-    xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8;

-    xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16;

-  } else {

+    case SIXTAP:

+      xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6;

+      break;

 #endif

-  if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {

-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4;

-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4;

-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8;

-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16;

-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4;

-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8;

-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16;

-  } else if (mcomp_filter_type == EIGHTTAP_SMOOTH) {

-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_smooth;

-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_smooth;

-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_smooth;

-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_smooth;

-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_smooth;

-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_smooth;

-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_smooth;

-  } else if (mcomp_filter_type == EIGHTTAP_SHARP) {

-    xd->subpixel_predict4x4     = vp9_eighttap_predict4x4_sharp;

-    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_sharp;

-    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_sharp;

-    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_sharp;

-    xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_sharp;

-    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp;

-    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c;

-  } else {

-    xd->subpixel_predict4x4     = vp9_bilinear_predict4x4;

-    xd->subpixel_predict8x4     = vp9_bilinear_predict8x4;

-    xd->subpixel_predict8x8     = vp9_bilinear_predict8x8;

-    xd->subpixel_predict16x16   = vp9_bilinear_predict16x16;

-    xd->subpixel_predict_avg4x4 = vp9_bilinear_predict_avg4x4;

-    xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8;

-    xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16;

-#if CONFIG_ENABLE_6TAP

-  }

-#endif

-void vp9_copy_mem16x16_c(uint8_t *src,

+void vp9_copy_mem16x16_c(const uint8_t *src,

                          int src_stride,

                          uint8_t *dst,

                          int dst_stride) {

@@ -93,10 +85,10 @@

     dst[15] = src[15];

 #else

-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];

-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];

-    ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];

-    ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];

+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];

+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];

+    ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];

+    ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];

 #endif

     src += src_stride;

@@ -104,25 +96,7 @@

-void vp9_avg_mem16x16_c(uint8_t *src,

-                        int src_stride,

-                        uint8_t *dst,

-                        int dst_stride) {

-  int r;

-  for (r = 0; r < 16; r++) {

-    int n;

-    for (n = 0; n < 16; n++) {

-      dst[n] = (dst[n] + src[n] + 1) >> 1;

-    }

-    src += src_stride;

-    dst += dst_stride;

-  }

-}

-void vp9_copy_mem8x8_c(uint8_t *src,

+void vp9_copy_mem8x8_c(const uint8_t *src,

                        int src_stride,

                        uint8_t *dst,

                        int dst_stride) {

@@ -139,8 +113,8 @@

     dst[6] = src[6];

     dst[7] = src[7];

 #else

-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];

-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];

+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];

+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];

 #endif

     src += src_stride;

     dst += dst_stride;

@@ -147,25 +121,7 @@

-void vp9_avg_mem8x8_c(uint8_t *src,

-                      int src_stride,

-                      uint8_t *dst,

-                      int dst_stride) {

-  int r;

-  for (r = 0; r < 8; r++) {

-    int n;

-    for (n = 0; n < 8; n++) {

-      dst[n] = (dst[n] + src[n] + 1) >> 1;

-    }

-    src += src_stride;

-    dst += dst_stride;

-  }

-}

-void vp9_copy_mem8x4_c(uint8_t *src,

+void vp9_copy_mem8x4_c(const uint8_t *src,

                        int src_stride,

                        uint8_t *dst,

                        int dst_stride) {

@@ -182,8 +138,8 @@

     dst[6] = src[6];

     dst[7] = src[7];

 #else

-    ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];

-    ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];

+    ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];

+    ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];

 #endif

     src += src_stride;

     dst += dst_stride;

@@ -190,8 +146,8 @@

-void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {

-  int r;

+void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,

+                                  struct subpix_fn_table *subpix) {

   uint8_t *ptr_base;

   uint8_t *ptr;

   uint8_t *pred_ptr = d->predictor;

@@ -199,30 +155,14 @@

   ptr_base = *(d->base_pre);

   mv.as_int = d->bmi.as_mv.first.as_int;

+  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

+        (mv.as_mv.col >> 3);

-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {

-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

-          (mv.as_mv.col >> 3);

-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,

-         pred_ptr, pitch);

-  } else {

-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

-                (mv.as_mv.col >> 3);

-    ptr = ptr_base;

-    for (r = 0; r < 4; r++) {

-#if !(CONFIG_FAST_UNALIGNED)

-      pred_ptr[0]  = ptr[0];

-      pred_ptr[1]  = ptr[1];

-      pred_ptr[2]  = ptr[2];

-      pred_ptr[3]  = ptr[3];

-#else

-      *(uint32_t *)pred_ptr = *(uint32_t *)ptr;

-#endif

-      pred_ptr     += pitch;

-      ptr         += d->pre_stride;

-    }

-  }

+  subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](

+      ptr, d->pre_stride, pred_ptr, pitch,

+      subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4,

+      subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4,

+      4, 4);

/*

@@ -232,8 +172,7 @@

  * predictor of the second reference frame / motion vector.

*/

 void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,

-                                      vp9_subpix_fn_t sppf) {

-  int r;

+                                      struct subpix_fn_table *subpix) {

   uint8_t *ptr_base;

   uint8_t *ptr;

   uint8_t *pred_ptr = d->predictor;

@@ -241,26 +180,14 @@

   ptr_base = *(d->base_second_pre);

   mv.as_int = d->bmi.as_mv.second.as_int;

+  ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

+        (mv.as_mv.col >> 3);

-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {

-    ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

-          (mv.as_mv.col >> 3);

-    sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,

-         pred_ptr, pitch);

-  } else {

-    ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

-                (mv.as_mv.col >> 3);

-    ptr = ptr_base;

-    for (r = 0; r < 4; r++) {

-      pred_ptr[0]  = (pred_ptr[0] + ptr[0] + 1) >> 1;

-      pred_ptr[1]  = (pred_ptr[1] + ptr[1] + 1) >> 1;

-      pred_ptr[2]  = (pred_ptr[2] + ptr[2] + 1) >> 1;

-      pred_ptr[3]  = (pred_ptr[3] + ptr[3] + 1) >> 1;

-      pred_ptr    += pitch;

-      ptr         += d->pre_stride;

-    }

-  }

+  subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1](

+      ptr, d->pre_stride, pred_ptr, pitch,

+      subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4,

+      subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4,

+      4, 4);

 void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {

@@ -274,12 +201,11 @@

   ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

         (mv.as_mv.col >> 3);

-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {

-    xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,

-                            (mv.as_mv.row & 7) << 1, pred_ptr, pitch);

-  } else {

-    vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);

-  }

+  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](

+      ptr, d->pre_stride, pred_ptr, pitch,

+      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,

+      8, 8);

/*

@@ -300,12 +226,11 @@

   ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

         (mv.as_mv.col >> 3);

-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {

-    xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,

-                               (mv.as_mv.row & 7) << 1, pred_ptr, pitch);

-  } else {

-    vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);

-  }

+  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1](

+      ptr, d->pre_stride, pred_ptr, pitch,

+      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,

+      8, 8);

 static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {

@@ -319,12 +244,11 @@

   ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +

         (mv.as_mv.col >> 3);

-  if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {

-    xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,

-                           (mv.as_mv.row & 7) << 1, pred_ptr, pitch);

-  } else {

-    vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);

-  }

+  xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](

+      ptr, d->pre_stride, pred_ptr, pitch,

+      xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,

+      8, 4);

 /*encoder only*/

@@ -411,13 +335,13 @@

     if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)

       build_inter_predictors2b(xd, d0, 8);

     else {

-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);

-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);

+      vp9_build_inter_predictors_b(d0, 8, &xd->subpix);

+      vp9_build_inter_predictors_b(d1, 8, &xd->subpix);

     if (xd->mode_info_context->mbmi.second_ref_frame > 0) {

-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);

-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);

+      vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix);

+      vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix);

@@ -475,14 +399,11 @@

   ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);

-    if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {

-      xd->subpixel_predict16x16(ptr, pre_stride,

-                                (ymv.as_mv.col & 7) << 1,

-                                (ymv.as_mv.row & 7) << 1,

-                                dst_y, dst_ystride);

-    } else {

-      vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);

-    }

+  xd->subpix.predict[!!(ymv.as_mv.col & 7)][!!(ymv.as_mv.row & 7)][0](

+      ptr, pre_stride, dst_y, dst_ystride,

+      xd->subpix.filter_x[(ymv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[(ymv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,

+      16, 16);

 void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,

@@ -523,15 +444,19 @@

   uptr = xd->pre.u_buffer + offset;

   vptr = xd->pre.v_buffer + offset;

-    if (_o16x16mv.as_int & 0x000f000f) {

-      xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,

-                              _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);

-      xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,

-                              _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);

-    } else {

-      vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);

-      vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);

-    }

+  xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)]

+                    [!!(_o16x16mv.as_mv.row & 15)][0](

+      uptr, pre_stride, dst_u, dst_uvstride,

+      xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4,

+      8, 8);

+  xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)]

+                    [!!(_o16x16mv.as_mv.row & 15)][0](

+      vptr, pre_stride, dst_v, dst_uvstride,

+      xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4,

+      8, 8);

@@ -714,12 +639,11 @@

   ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);

-  if ((mv_row | mv_col) & 7) {

-    xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,

-                                  (mv_row & 7) << 1, dst_y, dst_ystride);

-  } else {

-    vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);

-  }

+  xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][1](

+      ptr, pre_stride, dst_y, dst_ystride,

+      xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4,

+      16, 16);

 void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,

@@ -758,15 +682,17 @@

   uptr = xd->second_pre.u_buffer + offset;

   vptr = xd->second_pre.v_buffer + offset;

-    if ((omv_row | omv_col) & 15) {

-      xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,

-                                  omv_row & 15, dst_u, dst_uvstride);

-      xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,

-                                  omv_row & 15, dst_v, dst_uvstride);

-    } else {

-      vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);

-      vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);

-    }

+  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1](

+      uptr, pre_stride, dst_u, dst_uvstride,

+      xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4,

+      8, 8);

+  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1](

+      vptr, pre_stride, dst_v, dst_uvstride,

+      xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4,

+      8, 8);

 void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,

@@ -835,13 +761,13 @@

       if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)

         build_inter_predictors2b(xd, d0, 16);

       else {

-        vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict4x4);

-        vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict4x4);

+        vp9_build_inter_predictors_b(d0, 16, &xd->subpix);

+        vp9_build_inter_predictors_b(d1, 16, &xd->subpix);

       if (mbmi->second_ref_frame > 0) {

-        vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg4x4);

-        vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg4x4);

+        vp9_build_2nd_inter_predictors_b(d0, 16, &xd->subpix);

+        vp9_build_2nd_inter_predictors_b(d1, 16, &xd->subpix);

@@ -853,13 +779,13 @@

     if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)

       build_inter_predictors2b(xd, d0, 8);

     else {

-      vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);

-      vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);

+      vp9_build_inter_predictors_b(d0, 8, &xd->subpix);

+      vp9_build_inter_predictors_b(d1, 8, &xd->subpix);

     if (mbmi->second_ref_frame > 0) {

-      vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);

-      vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);

+      vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix);

+      vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix);

--- a/vp9/common/vp9_reconinter.h

+++ b/vp9/common/vp9_reconinter.h

@@ -14,6 +14,8 @@

 #include "vpx/vpx_integer.h"

 #include "vp9/common/vp9_onyxc_int.h"

+struct subpix_fn_table;

 extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,

                                                     uint8_t *dst_y,

                                                     int dst_ystride,

@@ -64,10 +66,10 @@

 extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);

 extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,

-                                         vp9_subpix_fn_t sppf);

+                                         struct subpix_fn_table *sppf);

 extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,

-                                             vp9_subpix_fn_t sppf);

+                                             struct subpix_fn_table *sppf);

 extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,

                                          int pitch);

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -23,21 +23,6 @@

 forward_decls vp9_common_forward_decls

-prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"

-prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"

-prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"

-prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"

-# At the very least, MSVC 2008 has compiler bug exhibited by this code; code

-# compiles warning free but a dissassembly of generated code show bugs. To be

-# on the safe side, only enabled when compiled with 'gcc'.

-if [ "$CONFIG_GCC" = "yes" ]; then

-    specialize vp9_filter_block2d_4x4_8 sse4_1 sse2

-fi

-    specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2

-    specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2

-    specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2

 # Dequant

@@ -86,27 +71,17 @@

 # RECON

-prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

+prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

 specialize vp9_copy_mem16x16 mmx sse2 dspr2

 vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2

-prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

+prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

 specialize vp9_copy_mem8x8 mmx dspr2

 vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2

-prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

+prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

 specialize vp9_copy_mem8x4 mmx

-prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

-specialize vp9_avg_mem16x16

-prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

-specialize vp9_avg_mem8x8

-prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"

-specialize vp9_copy_mem8x4 mmx dspr2

-vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2

 prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"

 specialize vp9_recon_b

@@ -269,110 +244,23 @@

 # Sub Pixel Filters

-prototype void vp9_eighttap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict16x16

+prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8 ssse3

-prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x8

+prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_horiz ssse3

-prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg16x16

+prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_vert ssse3

-prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg8x8

+prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_avg

-prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg4x4

+prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_avg_horiz

-prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x4

-prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict4x4

-prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict16x16_sharp

-prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x8_sharp

-prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg16x16_sharp

-prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg8x8_sharp

-prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg4x4_sharp

-prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x4_sharp

-prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict4x4_sharp

-prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict16x16_smooth

-prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x8_smooth

-prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg16x16_smooth

-prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg8x8_smooth

-prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict_avg4x4_smooth

-prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict8x4_smooth

-prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_eighttap_predict4x4_smooth

-prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict16x16

-prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict8x8

-prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict_avg16x16

-prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict_avg8x8

-prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict8x4

-prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict4x4

-prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_sixtap_predict_avg4x4

-prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict16x16 sse2

-prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict8x8 sse2

-prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict_avg16x16

-prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict_avg8x8

-prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict8x4

-prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict4x4

-prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, uint8_t *dst_ptr, int  dst_pitch"

-specialize vp9_bilinear_predict_avg4x4

+prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"

+specialize vp9_convolve8_avg_vert

 # dct

--- a/vp9/common/vp9_subpixel.h

+++ /dev/null

@@ -1,20 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef VP9_COMMON_VP9_SUBPIXEL_H_

-#define VP9_COMMON_VP9_SUBPIXEL_H_

-#define prototype_subpixel_predict(sym) \

-  void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \

-           uint8_t *dst, int dst_pitch)

-typedef prototype_subpixel_predict((*vp9_subpix_fn_t));

-#endif  // VP9_COMMON_VP9_SUBPIXEL_H_

--- a/vp9/common/x86/vp9_asm_stubs.c

+++ b/vp9/common/x86/vp9_asm_stubs.c

@@ -8,91 +8,11 @@

  *  be found in the AUTHORS file in the root of the source tree.

*/

+#include <assert.h>

 #include "./vpx_config.h"

+#include "./vp9_rtcd.h"

 #include "vpx_ports/mem.h"

-#include "vp9/common/vp9_subpixel.h"

-extern const short vp9_six_tap_mmx[8][6 * 8];

-extern void vp9_filter_block1d_h6_mmx(unsigned char   *src_ptr,

-                                      unsigned short  *output_ptr,

-                                      unsigned int     src_pixels_per_line,

-                                      unsigned int     pixel_step,

-                                      unsigned int     output_height,

-                                      unsigned int     output_width,

-                                      const short     *vp9_filter);

-extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,

-                                       unsigned char  *output_ptr,

-                                       int             output_pitch,

-                                       unsigned int    pixels_per_line,

-                                       unsigned int    pixel_step,

-                                       unsigned int    output_height,

-                                       unsigned int    output_width,

-                                       const short    *vp9_filter);

-extern void vp9_filter_block1d8_h6_sse2(unsigned char  *src_ptr,

-                                        unsigned short *output_ptr,

-                                        unsigned int    src_pixels_per_line,

-                                        unsigned int    pixel_step,

-                                        unsigned int    output_height,

-                                        unsigned int    output_width,

-                                        const short    *vp9_filter);

-extern void vp9_filter_block1d16_h6_sse2(unsigned char  *src_ptr,

-                                         unsigned short *output_ptr,

-                                         unsigned int    src_pixels_per_line,

-                                         unsigned int    pixel_step,

-                                         unsigned int    output_height,

-                                         unsigned int    output_width,

-                                         const short    *vp9_filter);

-extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,

-                                        unsigned char *output_ptr,

-                                        int dst_ptich,

-                                        unsigned int pixels_per_line,

-                                        unsigned int pixel_step,

-                                        unsigned int output_height,

-                                        unsigned int output_width,

-                                        const short    *vp9_filter);

-extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,

-                                         unsigned char *output_ptr,

-                                         int dst_ptich,

-                                         unsigned int pixels_per_line,

-                                         unsigned int pixel_step,

-                                         unsigned int output_height,

-                                         unsigned int output_width,

-                                         const short    *vp9_filter);

-extern void vp9_unpack_block1d16_h6_sse2(unsigned char  *src_ptr,

-                                         unsigned short *output_ptr,

-                                         unsigned int    src_pixels_per_line,

-                                         unsigned int    output_height,

-                                         unsigned int    output_width);

-extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,

-                                             unsigned int   src_pixels_per_line,

-                                             unsigned char *output_ptr,

-                                             int            dst_pitch,

-                                             unsigned int   output_height,

-                                             const short   *vp9_filter);

-extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,

-                                              unsigned int   src_pixels_per_lin,

-                                              unsigned char *output_ptr,

-                                              int            dst_pitch,

-                                              unsigned int   output_height,

-                                              const short   *vp9_filter);

-extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,

-                                             unsigned int   src_pixels_per_line,

-                                             unsigned char *output_ptr,

-                                             int            dst_pitch,

-                                             unsigned int   output_height,

-                                             const short   *vp9_filter);

 ///////////////////////////////////////////////////////////////////////////

 // the mmx function that does the bilinear filtering and var calculation //

 // int one pass                                                          //

@@ -116,389 +36,7 @@

   {   8,  8,  8,  8, 120, 120, 120, 120 }

};

-#if HAVE_MMX

-void vp9_sixtap_predict4x4_mmx(unsigned char  *src_ptr,

-                               int  src_pixels_per_line,

-                               int  xoffset,

-                               int  yoffset,

-                               unsigned char *dst_ptr,

-                               int  dst_pitch) {

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict4x4_mmx\n");

-#endif

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);

-  const short *hfilter, *vfilter;

-  hfilter = vp9_six_tap_mmx[xoffset];

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,

-                            src_pixels_per_line, 1, 9, 8, hfilter);

-  vfilter = vp9_six_tap_mmx[yoffset];

-  vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,

-                             8, 4, 4, 4, vfilter);

-}

-void vp9_sixtap_predict16x16_mmx(unsigned char  *src_ptr,

-                                 int  src_pixels_per_line,

-                                 int  xoffset,

-                                 int  yoffset,

-                                 unsigned char *dst_ptr,

-                                 int dst_pitch) {

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict16x16_mmx\n");

-#endif

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);

-  const short *hfilter, *vfilter;

-  hfilter = vp9_six_tap_mmx[xoffset];

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),

-                            fdata2,   src_pixels_per_line, 1, 21, 32,

-                            hfilter);

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,

-                            fdata2 + 4, src_pixels_per_line, 1, 21, 32,

-                            hfilter);

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,

-                            fdata2 + 8, src_pixels_per_line, 1, 21, 32,

-                            hfilter);

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,

-                            fdata2 + 12, src_pixels_per_line, 1, 21, 32,

-                            hfilter);

-  vfilter = vp9_six_tap_mmx[yoffset];

-  vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr,      dst_pitch,

-                             32, 16, 16, 16, vfilter);

-  vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4,  dst_pitch,

-                             32, 16, 16, 16, vfilter);

-  vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8,  dst_pitch,

-                             32, 16, 16, 16, vfilter);

-  vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,

-                             32, 16, 16, 16, vfilter);

-}

-void vp9_sixtap_predict8x8_mmx(unsigned char  *src_ptr,

-                               int  src_pixels_per_line,

-                               int  xoffset,

-                               int  yoffset,

-                               unsigned char *dst_ptr,

-                               int  dst_pitch) {

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x8_mmx\n");

-#endif

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);

-  const short *hfilter, *vfilter;

-  hfilter = vp9_six_tap_mmx[xoffset];

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),

-                            fdata2,   src_pixels_per_line, 1, 13, 16,

-                            hfilter);

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,

-                            fdata2 + 4, src_pixels_per_line, 1, 13, 16,

-                            hfilter);

-  vfilter = vp9_six_tap_mmx[yoffset];

-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,

-                             16, 8, 8, 8, vfilter);

-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,

-                             16, 8, 8, 8, vfilter);

-}

-void vp9_sixtap_predict8x4_mmx(unsigned char  *src_ptr,

-                               int  src_pixels_per_line,

-                               int  xoffset,

-                               int  yoffset,

-                               unsigned char *dst_ptr,

-                               int  dst_pitch) {

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x4_mmx\n");

-#endif

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);

-  const short *hfilter, *vfilter;

-  hfilter = vp9_six_tap_mmx[xoffset];

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),

-                            fdata2,   src_pixels_per_line, 1, 9, 16, hfilter);

-  vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,

-                            fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);

-  vfilter = vp9_six_tap_mmx[yoffset];

-  vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr,     dst_pitch,

-                             16, 8, 4, 8, vfilter);

-  vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,

-                             16, 8, 4, 8, vfilter);

-}

-#endif

-#if HAVE_SSE2

-void vp9_sixtap_predict16x16_sse2(unsigned char  *src_ptr,

-                                  int  src_pixels_per_line,

-                                  int  xoffset,

-                                  int  yoffset,

-                                  unsigned char *dst_ptr,

-                                  int  dst_pitch) {

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);

-  const short *hfilter, *vfilter;

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict16x16_sse2\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,

-                                   src_pixels_per_line, 1, 21, 32, hfilter);

-      vfilter = vp9_six_tap_mmx[yoffset];

-      vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,

-                                   32, 16, 16, dst_pitch, vfilter);

-    } else {

-      /* First-pass only */

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,

-                                        dst_ptr, dst_pitch, 16, hfilter);

-    }

-  } else {

-    /* Second-pass only */

-    vfilter = vp9_six_tap_mmx[yoffset];

-    vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,

-                                 src_pixels_per_line, 21, 32);

-    vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,

-                                 32, 16, 16, dst_pitch, vfilter);

-  }

-}

-void vp9_sixtap_predict8x8_sse2(unsigned char  *src_ptr,

-                                int  src_pixels_per_line,

-                                int  xoffset,

-                                int  yoffset,

-                                unsigned char *dst_ptr,

-                                int  dst_pitch) {

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);

-  const short *hfilter, *vfilter;

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x8_sse2\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,

-                                  src_pixels_per_line, 1, 13, 16, hfilter);

-      vfilter = vp9_six_tap_mmx[yoffset];

-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,

-                                  16, 8, 8, dst_pitch, vfilter);

-    } else {

-      /* First-pass only */

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,

-                                       dst_ptr, dst_pitch, 8, hfilter);

-    }

-  } else {

-    /* Second-pass only */

-    vfilter = vp9_six_tap_mmx[yoffset];

-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),

-                                     src_pixels_per_line,

-                                     dst_ptr, dst_pitch, 8, vfilter);

-  }

-}

-void vp9_sixtap_predict8x4_sse2(unsigned char  *src_ptr,

-                                int  src_pixels_per_line,

-                                int  xoffset,

-                                int  yoffset,

-                                unsigned char *dst_ptr,

-                                int  dst_pitch) {

-  /* Temp data bufffer used in filtering */

-  DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);

-  const short *hfilter, *vfilter;

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x4_sse2\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,

-                                  src_pixels_per_line, 1, 9, 16, hfilter);

-      vfilter = vp9_six_tap_mmx[yoffset];

-      vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,

-                                  16, 8, 4, dst_pitch, vfilter);

-    } else {

-      /* First-pass only */

-      hfilter = vp9_six_tap_mmx[xoffset];

-      vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,

-                                       dst_ptr, dst_pitch, 4, hfilter);

-    }

-  } else {

-    /* Second-pass only */

-    vfilter = vp9_six_tap_mmx[yoffset];

-    vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),

-                                     src_pixels_per_line,

-                                     dst_ptr, dst_pitch, 4, vfilter);

-  }

-}

-#endif

 #if HAVE_SSSE3

-extern void vp9_filter_block1d8_h6_ssse3(unsigned char  *src_ptr,

-                                         unsigned int    src_pixels_per_line,

-                                         unsigned char  *output_ptr,

-                                         unsigned int    output_pitch,

-                                         unsigned int    output_height,

-                                         unsigned int    vp9_filter_index);

-extern void vp9_filter_block1d16_h6_ssse3(unsigned char  *src_ptr,

-                                          unsigned int    src_pixels_per_line,

-                                          unsigned char  *output_ptr,

-                                          unsigned int    output_pitch,

-                                          unsigned int    output_height,

-                                          unsigned int    vp9_filter_index);

-extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,

-                                          unsigned int   src_pitch,

-                                          unsigned char *output_ptr,

-                                          unsigned int   out_pitch,

-                                          unsigned int   output_height,

-                                          unsigned int   vp9_filter_index);

-extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,

-                                         unsigned int   src_pitch,

-                                         unsigned char *output_ptr,

-                                         unsigned int   out_pitch,

-                                         unsigned int   output_height,

-                                         unsigned int   vp9_filter_index);

-extern void vp9_filter_block1d4_h6_ssse3(unsigned char  *src_ptr,

-                                         unsigned int    src_pixels_per_line,

-                                         unsigned char  *output_ptr,

-                                         unsigned int    output_pitch,

-                                         unsigned int    output_height,

-                                         unsigned int    vp9_filter_index);

-extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,

-                                         unsigned int   src_pitch,

-                                         unsigned char *output_ptr,

-                                         unsigned int   out_pitch,

-                                         unsigned int   output_height,

-                                         unsigned int   vp9_filter_index);

-void vp9_sixtap_predict16x16_ssse3(unsigned char  *src_ptr,

-                                   int  src_pixels_per_line,

-                                   int  xoffset,

-                                   int  yoffset,

-                                   unsigned char *dst_ptr,

-                                   int  dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict16x16_ssse3\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                    src_pixels_per_line,

-                                    fdata2, 16, 21, xoffset);

-      vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,

-                                    16, yoffset);

-    } else {

-      /* First-pass only */

-      vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,

-                                    dst_ptr, dst_pitch, 16, xoffset);

-    }

-  } else {

-    /* Second-pass only */

-    vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                  src_pixels_per_line,

-                                  dst_ptr, dst_pitch, 16, yoffset);

-  }

-}

-void vp9_sixtap_predict8x8_ssse3(unsigned char  *src_ptr,

-                                 int  src_pixels_per_line,

-                                 int  xoffset,

-                                 int  yoffset,

-                                 unsigned char *dst_ptr,

-                                 int  dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x8_ssse3\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                   src_pixels_per_line, fdata2, 8, 13, xoffset);

-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);

-    } else {

-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,

-                                   dst_ptr, dst_pitch, 8, xoffset);

-    }

-  } else {

-    /* Second-pass only */

-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                 src_pixels_per_line,

-                                 dst_ptr, dst_pitch, 8, yoffset);

-  }

-}

-void vp9_sixtap_predict8x4_ssse3(unsigned char  *src_ptr,

-                                 int  src_pixels_per_line,

-                                 int  xoffset,

-                                 int  yoffset,

-                                 unsigned char *dst_ptr,

-                                 int  dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict8x4_ssse3\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                   src_pixels_per_line, fdata2, 8, 9, xoffset);

-      vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);

-    } else {

-      /* First-pass only */

-      vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,

-                                   dst_ptr, dst_pitch, 4, xoffset);

-    }

-  } else {

-    /* Second-pass only */

-    vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                 src_pixels_per_line,

-                                 dst_ptr, dst_pitch, 4, yoffset);

-  }

-}

-void vp9_sixtap_predict4x4_ssse3(unsigned char  *src_ptr,

-                                 int   src_pixels_per_line,

-                                 int  xoffset,

-                                 int  yoffset,

-                                 unsigned char *dst_ptr,

-                                 int dst_pitch) {

-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);

-#ifdef ANNOUNCE_FUNCTION

-  printf("vp9_sixtap_predict4x4_ssse3\n");

-#endif

-  if (xoffset) {

-    if (yoffset) {

-      vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                   src_pixels_per_line, fdata2, 4, 9, xoffset);

-      vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);

-    } else {

-      vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,

-                                   dst_ptr, dst_pitch, 4, xoffset);

-    }

-  } else {

-    vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),

-                                 src_pixels_per_line,

-                                 dst_ptr, dst_pitch, 4, yoffset);

-  }

-}

 void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,

                                    const unsigned int src_pitch,

                                    unsigned char *output_ptr,

@@ -513,30 +51,6 @@

                                    unsigned int output_height,

                                    const short *filter);

-void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,

-                                      const unsigned int src_stride,

-                                      const short *hfilter_aligned16,

-                                      const short *vfilter_aligned16,

-                                      unsigned char *dst_ptr,

-                                      unsigned int dst_stride) {

-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {

-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);

-    vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                  fdata2, 16, 23, hfilter_aligned16);

-    vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,

-                                  vfilter_aligned16);

-  } else {

-    if (hfilter_aligned16[3] != 128) {

-      vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,

-                                    16, hfilter_aligned16);

-    } else {

-      vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                    dst_ptr, dst_stride, 16, vfilter_aligned16);

-    }

-  }

-}

 void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,

                                    const unsigned int src_pitch,

                                    unsigned char *output_ptr,

@@ -551,51 +65,100 @@

                                    unsigned int output_height,

                                    const short *filter);

-void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,

-                                    const unsigned int src_stride,

-                                    const short *hfilter_aligned16,

-                                    const short *vfilter_aligned16,

-                                    unsigned char *dst_ptr,

-                                    unsigned int dst_stride) {

-  if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {

-    DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);

+void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,  // horizontal pass: SSSE3 h8 kernels plus C fallback

+                               uint8_t *dst, int dst_stride,

+                               const int16_t *filter_x, int x_step_q4,

+                               const int16_t *filter_y, int y_step_q4,  // only forwarded to the C fallback

+                               int w, int h) {  // w: columns to produce; h: row count handed to the kernels

+  if (x_step_q4 == 16 && filter_x[3] != 128) {  // SSSE3 path; presumably "unscaled step, non-identity taps" -- confirm

+    while (w >= 16) {  // consume the width in 16-column strips

+      vp9_filter_block1d16_h8_ssse3(src, src_stride,

+                                    dst, dst_stride,

+                                    h, filter_x);

+      src += 16;  // step to the next strip

+      dst += 16;

+      w -= 16;

+    }

+    while (w >= 8) {  // then in 8-column strips

+      vp9_filter_block1d8_h8_ssse3(src, src_stride,

+                                   dst, dst_stride,

+                                   h, filter_x);

+      src += 8;

+      dst += 8;

+      w -= 8;

+    }

+  }

+  if (w) {  // leftover columns, or the whole block when the SSSE3 path was skipped

+    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,

+                          filter_x, x_step_q4, filter_y, y_step_q4,

+                          w, h);

+  }

+}

-    vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                 fdata2, 16, 15, hfilter_aligned16);

-    vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,

-                                 vfilter_aligned16);

-  } else {

-    if (hfilter_aligned16[3] != 128) {

-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,

-                                   hfilter_aligned16);

-    } else {

-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                   dst_ptr, dst_stride, 8, vfilter_aligned16);

+void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,  // vertical pass: SSSE3 v8 kernels plus C fallback

+                              uint8_t *dst, int dst_stride,

+                              const int16_t *filter_x, int x_step_q4,  // only forwarded to the C fallback

+                              const int16_t *filter_y, int y_step_q4,

+                              int w, int h) {  // w: columns to produce; h: row count handed to the kernels

+  if (y_step_q4 == 16 && filter_y[3] != 128) {  // SSSE3 path; presumably "unscaled step, non-identity taps" -- confirm

+    while (w >= 16) {  // consume the width in 16-column strips

+      vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,  // kernel is handed a pointer 3 rows above src

+                                    dst, dst_stride,

+                                    h, filter_y);

+      src += 16;  // step to the next strip

+      dst += 16;

+      w -= 16;

+    while (w >= 8) {  // then in 8-column strips

+      vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,  // same 3-row back-off as the 16-wide call

+                                   dst, dst_stride,

+                                   h, filter_y);

+      src += 8;

+      dst += 8;

+      w -= 8;

+    }

+  if (w) {  // leftover columns, or the whole block when the SSSE3 path was skipped

+    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,

+                         filter_x, x_step_q4, filter_y, y_step_q4,

+                         w, h);

+  }

-void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,

-                                    const unsigned int src_stride,

-                                    const short *hfilter_aligned16,

-                                    const short *vfilter_aligned16,

-                                    unsigned char *dst_ptr,

-                                    unsigned int dst_stride) {

-  if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) {

-      DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);

+void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,  // 2-D pass: h8 kernel into fdata2, then v8 kernel into dst

+                         uint8_t *dst, int dst_stride,

+                         const int16_t *filter_x, int x_step_q4,

+                         const int16_t *filter_y, int y_step_q4,

+                         int w, int h) {

+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);  // used below with pitch 16 and h + 7 rows (h <= 16)

-      vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                   fdata2, 16, 11, hfilter_aligned16);

-      vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,

-                                   vfilter_aligned16);

-  } else {

-    if (hfilter_aligned16[3] != 128) {

-      vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,

-                                   hfilter_aligned16);

-    } else {

-      vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,

-                                   dst_ptr, dst_stride, 4, vfilter_aligned16);

+  // check w/h due to fixed size fdata2 array

+  assert(w <= 16);

+  assert(h <= 16);

+  if (x_step_q4 == 16 && y_step_q4 == 16 &&  // SSSE3 path needs step 16 in both directions ...

+      filter_x[3] != 128 && filter_y[3] != 128) {  // ... and tap 3 != 128 in both filters

+    if (w == 16) {  // only widths of exactly 16 or 8 take the SSSE3 path

+      vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,  // begin 3 rows above the block

+                                    fdata2, 16,

+                                    h + 7, filter_x);  // 7 extra rows; presumably what the 8-tap vertical pass consumes -- confirm

+      vp9_filter_block1d16_v8_ssse3(fdata2, 16,  // vertical pass reads the intermediate rows

+                                    dst, dst_stride,

+                                    h, filter_y);

+      return;

+    if (w == 8) {

+      vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,  // begin 3 rows above the block

+                                   fdata2, 16,  // intermediate pitch stays 16 even for 8-wide blocks

+                                   h + 7, filter_x);

+      vp9_filter_block1d8_v8_ssse3(fdata2, 16,

+                                   dst, dst_stride,

+                                   h, filter_y);

+      return;

+    }

+  vp9_convolve8_c(src, src_stride, dst, dst_stride,  // every case not returned above is delegated to the C version

+                  filter_x, x_step_q4, filter_y, y_step_q4,

+                  w, h);

 #endif

--- a/vp9/common/x86/vp9_filter_sse2.c

+++ /dev/null

@@ -1,290 +1,0 @@

-/*

- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include <assert.h> // for alignment checks

-#include <emmintrin.h> // SSE2

-#include "vp9/common/vp9_filter.h"

-#include "vpx_ports/emmintrin_compat.h"

-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED

-#include "vp9_rtcd.h"

-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is

-//           just a quick partial snapshot so that other can already use some

-//           speedup.

-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap

-//           filtering.

-// TODO(cd): Add some comments, better variable naming.

-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum

-//           of positive above 128), or have higher precision filter

-//           coefficients.

-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-};

-// Creating a macro to do more than four pixels at once to hide instruction

-// latency is actually slower :-(

-#define DO_FOUR_PIXELS(result, src_ptr, offset)                                \

-  {                                                                            \

-  /* Do shifted load to achieve require shuffles through unpacking */          \

-  const __m128i src0  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \

-  const __m128i src1  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \

-  const __m128i src2  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \

-  const __m128i src3  = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \

-  const __m128i src01 = _mm_unpacklo_epi8(src0, src1);                         \

-  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero);                     \

-  const __m128i src23 = _mm_unpacklo_epi8(src2, src3);                         \

-  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero);                     \

-  /* Shit by 4 bytes through suffle to get additional shifted loads */         \

-  const __m128i src4  = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1));      \

-  const __m128i src5  = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1));      \

-  const __m128i src6  = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1));      \

-  const __m128i src7  = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1));      \

-  const __m128i src45 = _mm_unpacklo_epi8(src4, src5);                         \

-  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero);                     \

-  const __m128i src67 = _mm_unpacklo_epi8(src6, src7);                         \

-  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero);                     \

-  /* multiply accumulate them */                                               \

-  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                       \

-  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                       \

-  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                       \

-  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                       \

-  const __m128i mad0123 = _mm_add_epi32(mad01, mad23);                         \

-  const __m128i mad4567 = _mm_add_epi32(mad45, mad67);                         \

-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \

-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \

-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \

-  }

-void vp9_filter_block2d_4x4_8_sse2

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  __m128i intermediateA, intermediateB, intermediateC;

-  const int kInterp_Extend = 4;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);

-  // check alignment

-  assert(0 == ((long)HFilter_aligned16)%16);

-  assert(0 == ((long)VFilter_aligned16)%16);

-  {

-    __m128i transpose3_0;

-    __m128i transpose3_1;

-    __m128i transpose3_2;

-    __m128i transpose3_3;

-    // Horizontal pass (src -> intermediate).

-    {

-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);

-      // get first two columns filter coefficients

-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));

-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));

-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));

-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));

-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);

-      {

-        __m128i mad_all0;

-        __m128i mad_all1;

-        __m128i mad_all2;

-        __m128i mad_all3;

-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)

-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);

-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);

-        // --

-        src_ptr += src_stride*4;

-        // --

-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)

-        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);

-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);

-        // --

-        src_ptr += src_stride*4;

-        // --

-        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);

-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);

-      }

-    }

-    // Transpose result (intermediate -> transpose3_x)

-    {

-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33

-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73

-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx

-      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);

-      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);

-      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);

-      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);

-      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53

-      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73

-      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx

-      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx

-      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);

-      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);

-      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);

-      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);

-      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63

-      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73

-      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx

-      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx

-      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);

-      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);

-      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);

-      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);

-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71

-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73

-      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx

-      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx

-      transpose3_0 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),

-                                           _mm_castsi128_ps(transpose2_2),

-                                           _MM_SHUFFLE(1, 0, 1, 0)));

-      transpose3_1 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),

-                                           _mm_castsi128_ps(transpose2_2),

-                                           _MM_SHUFFLE(3, 2, 3, 2)));

-      transpose3_2 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),

-                                           _mm_castsi128_ps(transpose2_3),

-                                           _MM_SHUFFLE(1, 0, 1, 0)));

-      transpose3_3 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),

-                                           _mm_castsi128_ps(transpose2_3),

-                                           _MM_SHUFFLE(3, 2, 3, 2)));

-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx

-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx

-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx

-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx

-    }

-    // Vertical pass (transpose3_x -> dst).

-    {

-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);

-      // get first two columns filter coefficients

-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));

-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));

-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));

-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));

-      __m128i col0, col1, col2, col3;

-        DECLARE_ALIGNED(16, unsigned char, temp[32]);

-      {

-        _mm_store_si128((__m128i *)temp, transpose3_0);

-        DO_FOUR_PIXELS(col0, temp, 0);

-      }

-      {

-        _mm_store_si128((__m128i *)temp, transpose3_1);

-        DO_FOUR_PIXELS(col1, temp, 0);

-      }

-      {

-        _mm_store_si128((__m128i *)temp, transpose3_2);

-        DO_FOUR_PIXELS(col2, temp, 0);

-      }

-      {

-        _mm_store_si128((__m128i *)temp, transpose3_3);

-        DO_FOUR_PIXELS(col3, temp, 0);

-      }

-      // transpose

-      {

-        __m128i T0 = _mm_unpacklo_epi32(col0, col1);

-        __m128i T1 = _mm_unpacklo_epi32(col2, col3);

-        __m128i T2 = _mm_unpackhi_epi32(col0, col1);

-        __m128i T3 = _mm_unpackhi_epi32(col2, col3);

-        col0 = _mm_unpacklo_epi64(T0, T1);

-        col1 = _mm_unpackhi_epi64(T0, T1);

-        col2 = _mm_unpacklo_epi64(T2, T3);

-        col3 = _mm_unpackhi_epi64(T2, T3);

-      }

-      // saturate to 8 bit

-      {

-        col0 = _mm_packs_epi32(col0, col0);

-        col0 = _mm_packus_epi16(col0, col0);

-        col1 = _mm_packs_epi32(col1, col1);

-        col1 = _mm_packus_epi16(col1, col1);

-        col2 = _mm_packs_epi32 (col2, col2);

-        col2 = _mm_packus_epi16(col2, col2);

-        col3 = _mm_packs_epi32 (col3, col3);

-        col3 = _mm_packus_epi16(col3, col3);

-      }

-      // store

-      {

-        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);

-        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);

-        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);

-        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);

-      }

-    }

-  }

-}

-void vp9_filter_block2d_8x4_8_sse2

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int j;

-  for (j=0; j<8; j+=4) {

-    vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,

-                                  HFilter_aligned16, VFilter_aligned16,

-                                  dst_ptr + j, dst_stride);

-  }

-}

-void vp9_filter_block2d_8x8_8_sse2

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int i, j;

-  for (i=0; i<8; i+=4) {

-    for (j=0; j<8; j+=4) {

-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,

-                                    HFilter_aligned16, VFilter_aligned16,

-                                    dst_ptr + j + i*dst_stride, dst_stride);

-    }

-  }

-}

-void vp9_filter_block2d_16x16_8_sse2

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int i, j;

-  for (i=0; i<16; i+=4) {

-    for (j=0; j<16; j+=4) {

-      vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,

-                                    HFilter_aligned16, VFilter_aligned16,

-                                    dst_ptr + j + i*dst_stride, dst_stride);

-    }

-  }

-}

--- a/vp9/common/x86/vp9_filter_sse4.c

+++ /dev/null

@@ -1,362 +1,0 @@

-/*

- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#include <assert.h> // for alignment checks

-#include <smmintrin.h> // SSE4.1

-#include "vp9/common/vp9_filter.h"

-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED

-#include "vp9_rtcd.h"

-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is

-//           just a quick partial snapshot so that other can already use some

-//           speedup.

-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap

-//           filtering.

-// TODO(cd): Reduce source size by using macros instead of current code

-//           duplication.

-// TODO(cd): Add some comments, better variable naming.

-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum

-//           of positive above 128), or have higher precision filter

-//           coefficients.

-DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {

-  0x00, 0x01,

-  0x01, 0x02,

-  0x02, 0x03,

-  0x03, 0x04,

-  0x02, 0x03,

-  0x03, 0x04,

-  0x04, 0x05,

-  0x05, 0x06,

-};

-DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {

-  0x04, 0x05,

-  0x05, 0x06,

-  0x06, 0x07,

-  0x07, 0x08,

-  0x06, 0x07,

-  0x07, 0x08,

-  0x08, 0x09,

-  0x09, 0x0A,

-};

-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-  VP9_FILTER_WEIGHT >> 1,

-};

-DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {

-  0, 4,  8, 12,

-  1, 5,  9, 13,

-  2, 6, 10, 14,

-  3, 7, 11, 15

-};

-// Creating a macro to do more than four pixels at once to hide instruction

-// latency is actually slower :-(

-#define DO_FOUR_PIXELS(result, offset)                                         \

-  {                                                                            \

-  /*load pixels*/                                                              \

-  __m128i src  = _mm_loadu_si128((const __m128i *)(src_ptr + offset));         \

-  /* extract the ones used for first column */                                 \

-  __m128i src0123 = _mm_shuffle_epi8(src, mask0123);                           \

-  __m128i src4567 = _mm_shuffle_epi8(src, mask4567);                           \

-  __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);                         \

-  __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);                         \

-  __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);                         \

-  __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);                         \

-  /* multiply accumulate them */                                               \

-  __m128i mad01 = _mm_madd_epi16(src01_16, fil01);                             \

-  __m128i mad23 = _mm_madd_epi16(src23_16, fil23);                             \

-  __m128i mad45 = _mm_madd_epi16(src45_16, fil45);                             \

-  __m128i mad67 = _mm_madd_epi16(src67_16, fil67);                             \

-  __m128i mad0123 = _mm_add_epi32(mad01, mad23);                               \

-  __m128i mad4567 = _mm_add_epi32(mad45, mad67);                               \

-  __m128i mad_all = _mm_add_epi32(mad0123, mad4567);                           \

-  mad_all = _mm_add_epi32(mad_all, rounding);                                  \

-  result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);                          \

-  }

-void vp9_filter_block2d_4x4_8_sse4_1

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  __m128i intermediateA, intermediateB, intermediateC;

-  const int kInterp_Extend = 4;

-  const __m128i zero = _mm_set1_epi16(0);

-  const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);

-  const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);

-  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);

-  const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);

-  // check alignment

-  assert(0 == ((long)HFilter_aligned16)%16);

-  assert(0 == ((long)VFilter_aligned16)%16);

-  {

-    __m128i transpose3_0;

-    __m128i transpose3_1;

-    __m128i transpose3_2;

-    __m128i transpose3_3;

-    // Horizontal pass (src -> intermediate).

-    {

-      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);

-      // get first two columns filter coefficients

-      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));

-      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));

-      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));

-      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));

-      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);

-      {

-        __m128i mad_all0;

-        __m128i mad_all1;

-        __m128i mad_all2;

-        __m128i mad_all3;

-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)

-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);

-        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);

-        // --

-        src_ptr += src_stride*4;

-        // --

-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)

-        DO_FOUR_PIXELS(mad_all3, 3*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);

-        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);

-        // --

-        src_ptr += src_stride*4;

-        // --

-        DO_FOUR_PIXELS(mad_all0, 0*src_stride)

-        DO_FOUR_PIXELS(mad_all1, 1*src_stride)

-        DO_FOUR_PIXELS(mad_all2, 2*src_stride)

-        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);

-        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);

-        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);

-      }

-    }

-    // Transpose result (intermediate -> transpose3_x)

-    {

-      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33

-      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73

-      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx

-      const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);

-      const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);

-      const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);

-      // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33

-      // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73

-      // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx

-      const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);

-      const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);

-      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71

-      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73

-      transpose3_0 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),

-                                           _mm_castsi128_ps(transpose1_2),

-                                           _MM_SHUFFLE(0, 0, 1, 0)));

-      transpose3_1 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),

-                                           _mm_castsi128_ps(transpose1_2),

-                                           _MM_SHUFFLE(1, 1, 3, 2)));

-      transpose3_2 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),

-                                           _mm_castsi128_ps(transpose1_2),

-                                           _MM_SHUFFLE(2, 2, 1, 0)));

-      transpose3_3 = _mm_castps_si128(

-                            _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),

-                                           _mm_castsi128_ps(transpose1_2),

-                                           _MM_SHUFFLE(3, 3, 3, 2)));

-      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx

-      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx

-      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx

-      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx

-    }

-    // Vertical pass (transpose3_x -> dst).

-    {

-      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);

-      // get first two columns filter coefficients

-      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));

-      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));

-      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));

-      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));

-      __m128i col0, col1, col2, col3;

-      {

-        //load pixels

-        __m128i src  = transpose3_0;

-        // extract the ones used for first column

-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);

-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);

-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);

-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);

-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);

-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);

-        // multiply accumulate them

-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);

-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);

-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);

-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);

-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);

-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);

-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);

-        mad_all = _mm_add_epi32(mad_all, rounding);

-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);

-        mad_all = _mm_packs_epi32(mad_all, mad_all);

-        col0 = _mm_packus_epi16(mad_all, mad_all);

-      }

-      {

-        //load pixels

-        __m128i src  = transpose3_1;

-        // extract the ones used for first column

-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);

-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);

-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);

-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);

-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);

-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);

-        // multiply accumulate them

-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);

-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);

-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);

-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);

-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);

-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);

-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);

-        mad_all = _mm_add_epi32(mad_all, rounding);

-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);

-        mad_all = _mm_packs_epi32(mad_all, mad_all);

-        col1 = _mm_packus_epi16(mad_all, mad_all);

-      }

-      {

-        //load pixels

-        __m128i src  = transpose3_2;

-        // extract the ones used for first column

-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);

-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);

-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);

-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);

-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);

-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);

-        // multiply accumulate them

-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);

-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);

-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);

-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);

-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);

-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);

-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);

-        mad_all = _mm_add_epi32(mad_all, rounding);

-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);

-        mad_all = _mm_packs_epi32(mad_all, mad_all);

-        col2 = _mm_packus_epi16(mad_all, mad_all);

-      }

-      {

-        //load pixels

-        __m128i src  = transpose3_3;

-        // extract the ones used for first column

-        __m128i src0123 = _mm_shuffle_epi8(src, mask0123);

-        __m128i src4567 = _mm_shuffle_epi8(src, mask4567);

-        __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);

-        __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);

-        __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);

-        __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);

-        // multiply accumulate them

-        __m128i mad01 = _mm_madd_epi16(src01_16, fil01);

-        __m128i mad23 = _mm_madd_epi16(src23_16, fil23);

-        __m128i mad45 = _mm_madd_epi16(src45_16, fil45);

-        __m128i mad67 = _mm_madd_epi16(src67_16, fil67);

-        __m128i mad0123 = _mm_add_epi32(mad01, mad23);

-        __m128i mad4567 = _mm_add_epi32(mad45, mad67);

-        __m128i mad_all = _mm_add_epi32(mad0123, mad4567);

-        mad_all = _mm_add_epi32(mad_all, rounding);

-        mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);

-        mad_all = _mm_packs_epi32(mad_all, mad_all);

-        col3 = _mm_packus_epi16(mad_all, mad_all);

-      }

-      {

-        __m128i col01 = _mm_unpacklo_epi8(col0, col1);

-        __m128i col23 = _mm_unpacklo_epi8(col2, col3);

-        __m128i col0123 = _mm_unpacklo_epi16(col01, col23);

-        //TODO(cd): look into Ronald's comment:

-        //    Future suggestion: I believe here, too, you can merge the

-        //    packs_epi32() and pacus_epi16() for the 4 cols above, so that

-        //    you get the data in a single register, and then use pshufb

-        //    (shuffle_epi8()) instead of the unpacks here. Should be

-        //    2+3+2 instructions faster.

-        *((unsigned int *)&dst_ptr[dst_stride * 0]) =

-            _mm_extract_epi32(col0123, 0);

-        *((unsigned int *)&dst_ptr[dst_stride * 1]) =

-            _mm_extract_epi32(col0123, 1);

-        *((unsigned int *)&dst_ptr[dst_stride * 2]) =

-            _mm_extract_epi32(col0123, 2);

-        *((unsigned int *)&dst_ptr[dst_stride * 3]) =

-            _mm_extract_epi32(col0123, 3);

-      }

-    }

-  }

-}

-void vp9_filter_block2d_8x4_8_sse4_1

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int j;

-  for (j=0; j<8; j+=4) {

-    vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,

-                                    HFilter_aligned16, VFilter_aligned16,

-                                    dst_ptr + j, dst_stride);

-  }

-}

-void vp9_filter_block2d_8x8_8_sse4_1

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int i, j;

-  for (i=0; i<8; i+=4) {

-    for (j=0; j<8; j+=4) {

-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,

-                                      HFilter_aligned16, VFilter_aligned16,

-                                      dst_ptr + j + i*dst_stride, dst_stride);

-    }

-  }

-}

-void vp9_filter_block2d_16x16_8_sse4_1

-(

- const unsigned char *src_ptr, const unsigned int src_stride,

- const short *HFilter_aligned16, const short *VFilter_aligned16,

- unsigned char *dst_ptr, unsigned int dst_stride

-) {

-  int i, j;

-  for (i=0; i<16; i+=4) {

-    for (j=0; j<16; j+=4) {

-      vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,

-                                      HFilter_aligned16, VFilter_aligned16,

-                                      dst_ptr + j + i*dst_stride, dst_stride);

-    }

-  }

-}

--- a/vp9/common/x86/vp9_subpixel_mmx.asm

+++ /dev/null

@@ -1,268 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-%define BLOCK_HEIGHT_WIDTH 4

-%define vp9_filter_weight 128

-%define VP9_FILTER_SHIFT  7

-;void vp9_filter_block1d_h6_mmx

-;(

-;    unsigned char   *src_ptr,

-;    unsigned short  *output_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned int    pixel_step,

-;    unsigned int    output_height,

-;    unsigned int    output_width,

-;    short           * vp9_filter

-;)

-global sym(vp9_filter_block1d_h6_mmx) PRIVATE

-sym(vp9_filter_block1d_h6_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rdx,    arg(6) ;vp9_filter

-        movq        mm1,    [rdx + 16]             ; do both the negative taps first!!!

-        movq        mm2,    [rdx + 32]         ;

-        movq        mm6,    [rdx + 48]        ;

-        movq        mm7,    [rdx + 64]        ;

-        mov         rdi,    arg(1) ;output_ptr

-        mov         rsi,    arg(0) ;src_ptr

-        movsxd      rcx,    dword ptr arg(4) ;output_height

-        movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?

-        pxor        mm0,    mm0              ; mm0 = 00000000

-.nextrow:

-        movq        mm3,    [rsi-2]          ; mm3 = p-2..p5

-        movq        mm4,    mm3              ; mm4 = p-2..p5

-        psrlq       mm3,    8                ; mm3 = p-1..p5

-        punpcklbw   mm3,    mm0              ; mm3 = p-1..p2

-        pmullw      mm3,    mm1              ; mm3 *= kernel 1 modifiers.

-        movq        mm5,    mm4              ; mm5 = p-2..p5

-        punpckhbw   mm4,    mm0              ; mm5 = p2..p5

-        pmullw      mm4,    mm7              ; mm5 *= kernel 4 modifiers

-        paddsw      mm3,    mm4              ; mm3 += mm5

-        movq        mm4,    mm5              ; mm4 = p-2..p5;

-        psrlq       mm5,    16               ; mm5 = p0..p5;

-        punpcklbw   mm5,    mm0              ; mm5 = p0..p3

-        pmullw      mm5,    mm2              ; mm5 *= kernel 2 modifiers

-        paddsw      mm3,    mm5              ; mm3 += mm5

-        movq        mm5,    mm4              ; mm5 = p-2..p5

-        psrlq       mm4,    24               ; mm4 = p1..p5

-        punpcklbw   mm4,    mm0              ; mm4 = p1..p4

-        pmullw      mm4,    mm6              ; mm5 *= kernel 3 modifiers

-        paddsw      mm3,    mm4              ; mm3 += mm5

-        ; do outer positive taps

-        movd        mm4,    [rsi+3]

-        punpcklbw   mm4,    mm0              ; mm5 = p3..p6

-        pmullw      mm4,    [rdx+80]         ; mm5 *= kernel 0 modifiers

-        paddsw      mm3,    mm4              ; mm3 += mm5

-        punpcklbw   mm5,    mm0              ; mm5 = p-2..p1

-        pmullw      mm5,    [rdx]            ; mm5 *= kernel 5 modifiers

-        paddsw      mm3,    mm5              ; mm3 += mm5

-        paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value

-        psraw       mm3,    VP9_FILTER_SHIFT     ; mm3 /= 128

-        packuswb    mm3,    mm0              ; pack and unpack to saturate

-        punpcklbw   mm3,    mm0              ;

-        movq        [rdi],  mm3              ; store the results in the destination

-%if ABI_IS_32BIT

-        add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line

-        add         rdi,    rax;

-%else

-        movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line

-        add         rdi,    rax;

-        add         rsi,    r8               ; next line

-%endif

-        dec         rcx                      ; decrement count

-        jnz         .nextrow                 ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1dc_v6_mmx

-;(

-;   short *src_ptr,

-;   unsigned char *output_ptr,

-;    int output_pitch,

-;   unsigned int pixels_per_line,

-;   unsigned int pixel_step,

-;   unsigned int output_height,

-;   unsigned int output_width,

-;   short * vp9_filter

-;)

-global sym(vp9_filter_block1dc_v6_mmx) PRIVATE

-sym(vp9_filter_block1dc_v6_mmx):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 8

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        movq      mm5, [GLOBAL(rd)]

-        push        rbx

-        mov         rbx, arg(7) ;vp9_filter

-        movq      mm1, [rbx + 16]             ; do both the negative taps first!!!

-        movq      mm2, [rbx + 32]         ;

-        movq      mm6, [rbx + 48]        ;

-        movq      mm7, [rbx + 64]        ;

-        movsxd      rdx, dword ptr arg(3) ;pixels_per_line

-        mov         rdi, arg(1) ;output_ptr

-        mov         rsi, arg(0) ;src_ptr

-        sub         rsi, rdx

-        sub         rsi, rdx

-        movsxd      rcx, DWORD PTR arg(5) ;output_height

-        movsxd      rax, DWORD PTR arg(2) ;output_pitch      ; destination pitch?

-        pxor        mm0, mm0              ; mm0 = 00000000

-.nextrow_cv:

-        movq        mm3, [rsi+rdx]        ; mm3 = p0..p8  = row -1

-        pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.

-        movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2

-        pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.

-        paddsw      mm3, mm4              ; mm3 += mm4

-        movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0

-        pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.

-        paddsw      mm3, mm4              ; mm3 += mm4

-        movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2

-        pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.

-        paddsw      mm3, mm4              ; mm3 += mm4

-        add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch

-        movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1

-        pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.

-        paddsw      mm3, mm4              ; mm3 += mm4

-        movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3

-        pmullw      mm4, [rbx +80]        ; mm4 *= kernel 3 modifiers.

-        paddsw      mm3, mm4              ; mm3 += mm4

-        paddsw      mm3, mm5               ; mm3 += round value

-        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128

-        packuswb    mm3, mm0              ; pack and saturate

-        movd        [rdi],mm3             ; store the results in the destination

-        ; the subsequent iterations repeat 3 out of 4 of these reads.  Since the

-        ; recon block should be in cache this shouldn't cost much.  Its obviously

-        ; avoidable!!!.

-        lea         rdi,  [rdi+rax] ;

-        dec         rcx                   ; decrement count

-        jnz         .nextrow_cv           ; next row

-        pop         rbx

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-SECTION_RODATA

-align 16

-rd:

-    times 4 dw 0x40

-align 16

-global HIDDEN_DATA(sym(vp9_six_tap_mmx))

-sym(vp9_six_tap_mmx):

-    times 8 dw 0

-    times 8 dw 0

-    times 8 dw 128

-    times 8 dw 0

-    times 8 dw 0

-    times 8 dw 0

-    times 8 dw 0

-    times 8 dw -6

-    times 8 dw 123

-    times 8 dw 12

-    times 8 dw -1

-    times 8 dw 0

-    times 8 dw 2

-    times 8 dw -11

-    times 8 dw 108

-    times 8 dw 36

-    times 8 dw -8

-    times 8 dw 1

-    times 8 dw 0

-    times 8 dw -9

-    times 8 dw 93

-    times 8 dw 50

-    times 8 dw -6

-    times 8 dw 0

-    times 8 dw 3

-    times 8 dw -16

-    times 8 dw 77

-    times 8 dw 77

-    times 8 dw -16

-    times 8 dw 3

-    times 8 dw 0

-    times 8 dw -6

-    times 8 dw 50

-    times 8 dw 93

-    times 8 dw -9

-    times 8 dw 0

-    times 8 dw 1

-    times 8 dw -8

-    times 8 dw 36

-    times 8 dw 108

-    times 8 dw -11

-    times 8 dw 2

-    times 8 dw 0

-    times 8 dw -1

-    times 8 dw 12

-    times 8 dw 123

-    times 8 dw -6

-    times 8 dw 0

--- a/vp9/common/x86/vp9_subpixel_sse2.asm

+++ /dev/null

@@ -1,1372 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-%define BLOCK_HEIGHT_WIDTH 4

-%define VP9_FILTER_WEIGHT 128

-%define VP9_FILTER_SHIFT  7

-;/************************************************************************************

-; Notes: vp9_filter_block1d8_h6_sse2 applies a 6-tap filter horizontally to 8 pixels per

-; row. Each result is rounded, shifted right by 7, clamped to 0..255 and then stored as a

-; 16-bit word, so one 16-byte vector is written per row. One row is produced per loop

-; iteration; the loop is bottom-tested, so output_height must be at least 1.

-;*************************************************************************************/

-;void vp9_filter_block1d8_h6_sse2

-;(

-;    unsigned char  *src_ptr,              ; start of the row; bytes src_ptr[-2 .. 13] are read

-;    unsigned short *output_ptr,           ; written with movdqa, so it must be 16-byte aligned

-;    unsigned int    src_pixels_per_line,  ; added to src_ptr after every row

-;    unsigned int    pixel_step,           ; never read by this routine

-;    unsigned int    output_height,        ; number of rows to produce

-;    unsigned int    output_width,         ; added to output_ptr after every row, as a byte offset

-;    short           *vp9_filter           ; six tap vectors of 8 words each, 16 bytes apart

-;)

-global sym(vp9_filter_block1d8_h6_sse2) PRIVATE

-sym(vp9_filter_block1d8_h6_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 7

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rdx,        arg(6) ;vp9_filter

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(1) ;output_ptr

-        movsxd      rcx,        dword ptr arg(4) ;output_height

-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(5) ;output_width

-%endif

-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

-.filter_block1d8_h6_rowloop:

-        movq        xmm3,       MMWORD PTR [rsi - 2]        ; src[-2 .. 5]

-        movq        xmm1,       MMWORD PTR [rsi + 6]        ; src[ 6 .. 13]

-        prefetcht2  [rsi+rax-2]                             ; prefetch the next source row

-        pslldq      xmm1,       8                           ; move src[6..13] into the high half

-        por         xmm1,       xmm3                        ; xmm1 = 16 bytes src[-2 .. 13]

-        movdqa      xmm4,       xmm1

-        movdqa      xmm5,       xmm1

-        movdqa      xmm6,       xmm1

-        movdqa      xmm7,       xmm1

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7                        ; saturating sum of the six products

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm1

-        paddsw      xmm4,       [GLOBAL(rd)]                ; + rd (NOTE(review): presumably the rounding term, 64 per word -- confirm at rd's definition)

-        psraw       xmm4,       7                           ; >> 7, the value of VP9_FILTER_SHIFT

-        packuswb    xmm4,       xmm0                        ; saturate to unsigned bytes (0..255)

-        punpcklbw   xmm4,       xmm0                        ; zero-extend back to 8 words

-        movdqa      XMMWORD Ptr [rdi],         xmm4         ; aligned store of the 8 results

-        lea         rsi,        [rsi + rax]                 ; next source row

-%if ABI_IS_32BIT

-        add         rdi,        DWORD Ptr arg(5) ;[output_width]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx

-        jnz         .filter_block1d8_h6_rowloop                ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d16_h6_sse2

-;(

-;    unsigned char  *src_ptr,              ; start of the row; bytes src_ptr[-2 .. 21] are read

-;    unsigned short *output_ptr,           ; written with movdqa, so it must be 16-byte aligned

-;    unsigned int    src_pixels_per_line,  ; added to src_ptr after every row

-;    unsigned int    pixel_step,           ; never read by this routine

-;    unsigned int    output_height,        ; number of rows to produce

-;    unsigned int    output_width,         ; added to output_ptr after every row, as a byte offset

-;    short           *vp9_filter           ; six tap vectors of 8 words each, 16 bytes apart

-;)

-;/************************************************************************************

-; Notes: vp9_filter_block1d16_h6_sse2 applies a 6-tap filter horizontally to 16 pixels per

-; row, as two independent groups of 8. Each result is rounded, shifted right by 7, clamped

-; to 0..255 and stored as a 16-bit word, so two 16-byte vectors are written per row. One

-; row is produced per loop iteration; the loop is bottom-tested (output_height >= 1).

-;*************************************************************************************/

-global sym(vp9_filter_block1d16_h6_sse2) PRIVATE

-sym(vp9_filter_block1d16_h6_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 7

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rdx,        arg(6) ;vp9_filter

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(1) ;output_ptr

-        movsxd      rcx,        dword ptr arg(4) ;output_height

-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(5) ;output_width

-%endif

-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

-.filter_block1d16_h6_sse2_rowloop:

-        movq        xmm3,       MMWORD PTR [rsi - 2]        ; src[-2 .. 5]

-        movq        xmm1,       MMWORD PTR [rsi + 6]        ; src[ 6 .. 13]

-        movq        xmm2,       MMWORD PTR [rsi +14]        ; src[14 .. 21]

-        pslldq      xmm2,       8

-        por         xmm2,       xmm1                        ; xmm2 = src[6 .. 21], input for pixels 8..15

-        prefetcht2  [rsi+rax-2]                             ; prefetch the next source row

-        pslldq      xmm1,       8

-        por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13], input for pixels 0..7

-        movdqa      xmm4,       xmm1

-        movdqa      xmm5,       xmm1

-        movdqa      xmm6,       xmm1

-        movdqa      xmm7,       xmm1

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7                        ; saturating sum of the six products

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm1

-        paddsw      xmm4,       [GLOBAL(rd)]                ; + rd (NOTE(review): presumably the rounding term, 64 per word -- confirm at rd's definition)

-        psraw       xmm4,       7                           ; >> 7, the value of VP9_FILTER_SHIFT

-        packuswb    xmm4,       xmm0                        ; saturate to unsigned bytes (0..255)

-        punpcklbw   xmm4,       xmm0                        ; zero-extend back to 8 words

-        movdqa      XMMWORD Ptr [rdi],         xmm4         ; results for pixels 0..7

-        movdqa      xmm3,       xmm2                        ; second half: byte indices in the lane comments below are relative to src+8

-        movdqa      xmm4,       xmm2

-        movdqa      xmm5,       xmm2

-        movdqa      xmm6,       xmm2

-        movdqa      xmm7,       xmm2

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm2

-        paddsw      xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       7

-        packuswb    xmm4,       xmm0

-        punpcklbw   xmm4,       xmm0

-        movdqa      XMMWORD Ptr [rdi+16],      xmm4         ; results for pixels 8..15

-        lea         rsi,        [rsi + rax]                 ; next source row

-%if ABI_IS_32BIT

-        add         rdi,        DWORD Ptr arg(5) ;[output_width]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx

-        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d8_v6_sse2

-;(

-;    short *src_ptr,                 ; 16-bit input rows, loaded with movdqa (16-byte aligned)

-;    unsigned char *output_ptr,      ; 8 bytes are written per row

-;    int dst_pitch,                  ; added to output_ptr after every row

-;    unsigned int pixels_per_line,   ; used as the byte offset between consecutive input rows

-;    unsigned int pixel_step,        ; never read by this routine

-;    unsigned int output_height,     ; number of rows to produce (loop is bottom-tested, so >= 1)

-;    unsigned int output_width,      ; never read by this routine

-;    short * vp9_filter              ; six tap vectors of 8 words each, 16 bytes apart

-;)

-;/************************************************************************************

-; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to rows of eight 16-bit values.

-; Each output row reads six input rows starting two rows above src_ptr, so output_height + 5 input rows are read.

-;*************************************************************************************/

-global sym(vp9_filter_block1d8_v6_sse2) PRIVATE

-sym(vp9_filter_block1d8_v6_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 8

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rax,        arg(7) ;vp9_filter

-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

-        mov         rdi,        arg(1) ;output_ptr

-        mov         rsi,        arg(0) ;src_ptr

-        sub         rsi,        rdx                         ; back up two rows so the taps

-        sub         rsi,        rdx                         ; cover rows -2 .. +3 around src_ptr

-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]

-        pxor        xmm0,       xmm0                        ; clear xmm0

-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; NOTE(review): presumably the rounding term -- confirm at rd's definition

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(2) ; dst_pitch

-%endif

-.vp9_filter_block1d8_v6_sse2_loop:

-        movdqa      xmm1,       XMMWORD PTR [rsi]           ; row -2

-        pmullw      xmm1,       [rax]                       ; * tap 1

-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]     ; row -1

-        pmullw      xmm2,       [rax + 16]                  ; * tap 2

-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2] ; row  0

-        pmullw      xmm3,       [rax + 32]                  ; * tap 3

-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4] ; row +2

-        pmullw      xmm5,       [rax + 64]                  ; * tap 5

-        add         rsi,        rdx                         ; advance one row; this is also the per-iteration source step

-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2] ; row +1

-        pmullw      xmm4,       [rax + 48]                  ; * tap 4

-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4] ; row +3

-        pmullw      xmm6,       [rax + 80]                  ; * tap 6

-        paddsw      xmm2,       xmm5                        ; saturating sum of the six products

-        paddsw      xmm2,       xmm3

-        paddsw      xmm2,       xmm1

-        paddsw      xmm2,       xmm4

-        paddsw      xmm2,       xmm6

-        paddsw      xmm2,       xmm7                        ; + rd

-        psraw       xmm2,       7                           ; >> 7, the value of VP9_FILTER_SHIFT

-        packuswb    xmm2,       xmm0              ; pack and saturate

-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination

-%if ABI_IS_32BIT

-        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx         ; decrement count

-        jnz         .vp9_filter_block1d8_v6_sse2_loop               ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d16_v6_sse2

-;(

-;    unsigned short *src_ptr,        ; 16-bit input rows, 32 bytes read per row with movdqa (16-byte aligned)

-;    unsigned char *output_ptr,      ; 16 bytes are written per row with movdqa (16-byte aligned)

-;    int dst_pitch,                  ; added to output_ptr after every row

-;    unsigned int pixels_per_line,   ; used as the byte offset between consecutive input rows

-;    unsigned int pixel_step,        ; never read by this routine

-;    unsigned int output_height,     ; number of rows to produce (loop is bottom-tested, so >= 1)

-;    unsigned int output_width,      ; never read by this routine

-;    const short    *vp9_filter      ; six tap vectors of 8 words each, 16 bytes apart

-;)

-;/************************************************************************************

-; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to rows of sixteen 16-bit values.

-; Each output row reads six input rows starting two rows above src_ptr, so output_height + 5 input rows are read.

-;*************************************************************************************/

-global sym(vp9_filter_block1d16_v6_sse2) PRIVATE

-sym(vp9_filter_block1d16_v6_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 8

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rax,        arg(7) ;vp9_filter

-        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

-        mov         rdi,        arg(1) ;output_ptr

-        mov         rsi,        arg(0) ;src_ptr

-        sub         rsi,        rdx                         ; back up two rows so that "line 1"

-        sub         rsi,        rdx                         ; below is the row two above src_ptr

-        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(2) ; dst_pitch

-%endif

-.vp9_filter_block1d16_v6_sse2_loop:

-; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.

-        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2

-        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]

-        pmullw      xmm1,       [rax + 16]

-        pmullw      xmm2,       [rax + 16]

-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5

-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]

-        pmullw      xmm3,       [rax + 64]

-        pmullw      xmm4,       [rax + 64]

-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3

-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]

-        pmullw      xmm5,       [rax + 32]

-        pmullw      xmm6,       [rax + 32]

-        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1

-        movdqa      xmm0,       XMMWORD PTR [rsi + 16]

-        pmullw      xmm7,       [rax]

-        pmullw      xmm0,       [rax]

-        paddsw      xmm1,       xmm3                        ; xmm1 accumulates columns 0..7,

-        paddsw      xmm2,       xmm4                        ; xmm2 accumulates columns 8..15

-        paddsw      xmm1,       xmm5

-        paddsw      xmm2,       xmm6

-        paddsw      xmm1,       xmm7

-        paddsw      xmm2,       xmm0

-        add         rsi,        rdx                         ; advance one row; this is also the per-iteration source step

-        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4

-        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]

-        pmullw      xmm3,       [rax + 48]

-        pmullw      xmm4,       [rax + 48]

-        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6

-        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]

-        pmullw      xmm5,       [rax + 80]

-        pmullw      xmm6,       [rax + 80]

-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; reloaded every row because xmm7 held data above; NOTE(review): presumably the rounding term -- confirm at rd's definition

-        pxor        xmm0,       xmm0                        ; clear xmm0 (not read again before it is reloaded in the next iteration)

-        paddsw      xmm1,       xmm3

-        paddsw      xmm2,       xmm4

-        paddsw      xmm1,       xmm5

-        paddsw      xmm2,       xmm6

-        paddsw      xmm1,       xmm7                        ; + rd

-        paddsw      xmm2,       xmm7

-        psraw       xmm1,       7                           ; >> 7, the value of VP9_FILTER_SHIFT

-        psraw       xmm2,       7

-        packuswb    xmm1,       xmm2              ; pack and saturate

-        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination

-%if ABI_IS_32BIT

-        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx         ; decrement count

-        jnz         .vp9_filter_block1d16_v6_sse2_loop              ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d8_h6_only_sse2

-;(

-;    unsigned char  *src_ptr,              ; start of the row; bytes src_ptr[-2 .. 13] are read

-;    unsigned int    src_pixels_per_line,  ; added to src_ptr after every row

-;    unsigned char  *output_ptr,           ; 8 bytes are written per row

-;    int dst_pitch,                        ; added to output_ptr after every row

-;    unsigned int    output_height,        ; number of rows to produce (loop is bottom-tested, so >= 1)

-;    const short    *vp9_filter            ; six tap vectors of 8 words each, 16 bytes apart

-;)

-; Horizontal 6-tap pass that writes 8-bit results straight to output_ptr (8 pixels per row). Per the original note: first-pass filter only, used when yoffset==0.

-global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE

-sym(vp9_filter_block1d8_h6_only_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rdx,        arg(5) ;vp9_filter

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(2) ;output_ptr

-        movsxd      rcx,        dword ptr arg(4) ;output_height

-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(3) ;dst_pitch

-%endif

-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

-.filter_block1d8_h6_only_rowloop:

-        movq        xmm3,       MMWORD PTR [rsi - 2]        ; src[-2 .. 5]

-        movq        xmm1,       MMWORD PTR [rsi + 6]        ; src[ 6 .. 13]

-        prefetcht2  [rsi+rax-2]                             ; prefetch the next source row

-        pslldq      xmm1,       8                           ; move src[6..13] into the high half

-        por         xmm1,       xmm3                        ; xmm1 = 16 bytes src[-2 .. 13]

-        movdqa      xmm4,       xmm1

-        movdqa      xmm5,       xmm1

-        movdqa      xmm6,       xmm1

-        movdqa      xmm7,       xmm1

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7                        ; saturating sum of the six products

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm1

-        paddsw      xmm4,       [GLOBAL(rd)]                ; + rd (NOTE(review): presumably the rounding term, 64 per word -- confirm at rd's definition)

-        psraw       xmm4,       7                           ; >> 7, the value of VP9_FILTER_SHIFT

-        packuswb    xmm4,       xmm0                        ; saturate to unsigned bytes (0..255)

-        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination

-        lea         rsi,        [rsi + rax]                 ; next source row

-%if ABI_IS_32BIT

-        add         rdi,        DWORD Ptr arg(3) ;dst_pitch

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx

-        jnz         .filter_block1d8_h6_only_rowloop               ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d16_h6_only_sse2

-;(

-;    unsigned char  *src_ptr,              ; start of the row; bytes src_ptr[-2 .. 21] are read

-;    unsigned int    src_pixels_per_line,  ; added to src_ptr after every row

-;    unsigned char  *output_ptr,           ; 16 bytes are written per row, as two 8-byte stores

-;    int dst_pitch,                        ; added to output_ptr after every row

-;    unsigned int    output_height,        ; number of rows to produce (loop is bottom-tested, so >= 1)

-;    const short    *vp9_filter            ; six tap vectors of 8 words each, 16 bytes apart

-;)

-; Horizontal 6-tap pass that writes 8-bit results straight to output_ptr (16 pixels per row, as two groups of 8). Per the original note: first-pass filter only, used when yoffset==0.

-global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE

-sym(vp9_filter_block1d16_h6_only_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rdx,        arg(5) ;vp9_filter

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(2) ;output_ptr

-        movsxd      rcx,        dword ptr arg(4) ;output_height

-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(3) ;dst_pitch

-%endif

-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

-.filter_block1d16_h6_only_sse2_rowloop:

-        movq        xmm3,       MMWORD PTR [rsi - 2]        ; src[-2 .. 5]

-        movq        xmm1,       MMWORD PTR [rsi + 6]        ; src[ 6 .. 13]

-        movq        xmm2,       MMWORD PTR [rsi +14]        ; src[14 .. 21]

-        pslldq      xmm2,       8

-        por         xmm2,       xmm1                        ; xmm2 = src[6 .. 21], input for pixels 8..15

-        prefetcht2  [rsi+rax-2]                             ; prefetch the next source row

-        pslldq      xmm1,       8

-        por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13], input for pixels 0..7

-        movdqa      xmm4,       xmm1

-        movdqa      xmm5,       xmm1

-        movdqa      xmm6,       xmm1

-        movdqa      xmm7,       xmm1

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7                        ; saturating sum of the six products

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm1

-        paddsw      xmm4,       [GLOBAL(rd)]                ; + rd (NOTE(review): presumably the rounding term, 64 per word -- confirm at rd's definition)

-        psraw       xmm4,       7                           ; >> 7, the value of VP9_FILTER_SHIFT

-        packuswb    xmm4,       xmm0                        ; lower 8 bytes

-        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination

-        movdqa      xmm3,       xmm2                        ; second half: byte indices in the lane comments below are relative to src+8

-        movdqa      xmm4,       xmm2

-        movdqa      xmm5,       xmm2

-        movdqa      xmm6,       xmm2

-        movdqa      xmm7,       xmm2

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2

-        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

-        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

-        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

-        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

-        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

-        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

-        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

-        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

-        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

-        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

-        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

-        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

-        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

-        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

-        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

-        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

-        paddsw      xmm4,       xmm7

-        paddsw      xmm4,       xmm5

-        paddsw      xmm4,       xmm3

-        paddsw      xmm4,       xmm6

-        paddsw      xmm4,       xmm2

-        paddsw      xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       7

-        packuswb    xmm4,       xmm0                        ; higher 8 bytes

-        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination

-        lea         rsi,        [rsi + rax]                 ; next source row

-%if ABI_IS_32BIT

-        add         rdi,        DWORD Ptr arg(3) ;dst_pitch

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx

-        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d8_v6_only_sse2

-;(

-;    unsigned char *src_ptr,               ; first of the six rows used for the first output row

-;    unsigned int    src_pixels_per_line,  ; byte offset between consecutive source rows

-;    unsigned char *output_ptr,            ; 8 bytes are written per row

-;    int dst_pitch,                        ; added to output_ptr after every row

-;    unsigned int output_height,           ; number of rows to produce (loop is bottom-tested, so >= 1)

-;    const short    *vp9_filter            ; six tap vectors of 8 words each, 16 bytes apart

-;)

-; Vertical 6-tap pass over 8-bit source rows (8 pixels per row). Per the original note: second-pass filter only, used when xoffset==0. Unlike the v6 routines above, src_ptr is not backed up two rows here; rows src_ptr + 0..5 * stride feed the first output row.

-global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE

-sym(vp9_filter_block1d8_v6_only_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(2) ;output_ptr

-        movsxd      rcx,        dword ptr arg(4) ;output_height

-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

-        mov         rax,        arg(5) ;vp9_filter

-        pxor        xmm0,       xmm0                        ; clear xmm0

-        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; NOTE(review): presumably the rounding term -- confirm at rd's definition

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(3) ; dst_pitch

-%endif

-.vp9_filter_block1d8_v6_only_sse2_loop:

-        movq        xmm1,       MMWORD PTR [rsi]            ; row 0

-        movq        xmm2,       MMWORD PTR [rsi + rdx]      ; row 1

-        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]  ; row 2

-        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]  ; row 4

-        add         rsi,        rdx                         ; advance one row; this is also the per-iteration source step

-        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]  ; row 3

-        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]  ; row 5

-        punpcklbw   xmm1,       xmm0                        ; zero-extend bytes to words before each multiply

-        pmullw      xmm1,       [rax]                       ; row 0 * tap 1

-        punpcklbw   xmm2,       xmm0

-        pmullw      xmm2,       [rax + 16]                  ; row 1 * tap 2

-        punpcklbw   xmm3,       xmm0

-        pmullw      xmm3,       [rax + 32]                  ; row 2 * tap 3

-        punpcklbw   xmm5,       xmm0

-        pmullw      xmm5,       [rax + 64]                  ; row 4 * tap 5

-        punpcklbw   xmm4,       xmm0

-        pmullw      xmm4,       [rax + 48]                  ; row 3 * tap 4

-        punpcklbw   xmm6,       xmm0

-        pmullw      xmm6,       [rax + 80]                  ; row 5 * tap 6

-        paddsw      xmm2,       xmm5                        ; saturating sum of the six products

-        paddsw      xmm2,       xmm3

-        paddsw      xmm2,       xmm1

-        paddsw      xmm2,       xmm4

-        paddsw      xmm2,       xmm6

-        paddsw      xmm2,       xmm7                        ; + rd

-        psraw       xmm2,       7                           ; >> 7, the value of VP9_FILTER_SHIFT

-        packuswb    xmm2,       xmm0              ; pack and saturate

-        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination

-%if ABI_IS_32BIT

-        add         rdi,        DWORD PTR arg(3) ;[dst_pitch]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx         ; decrement count

-        jnz         .vp9_filter_block1d8_v6_only_sse2_loop              ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_unpack_block1d16_h6_sse2

-;(

-;    unsigned char  *src_ptr,

-;    unsigned short *output_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned int    output_height,

-;    unsigned int    output_width

-;)

-global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE

-sym(vp9_unpack_block1d16_h6_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 5

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        mov         rsi,        arg(0) ;src_ptr

-        mov         rdi,        arg(1) ;output_ptr

-        movsxd      rcx,        dword ptr arg(3) ;output_height

-        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source

-        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(4) ;output_width            ; Pitch for Source

-%endif

-.unpack_block1d16_h6_sse2_rowloop:

-        movq        xmm1,       MMWORD PTR [rsi]            ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2

-        movq        xmm3,       MMWORD PTR [rsi+8]          ; make copy of xmm1

-        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

-        punpcklbw   xmm1,       xmm0

-        movdqa      XMMWORD Ptr [rdi],         xmm1

-        movdqa      XMMWORD Ptr [rdi + 16],    xmm3

-        lea         rsi,        [rsi + rax]

-%if ABI_IS_32BIT

-        add         rdi,        DWORD Ptr arg(4) ;[output_width]

-%else

-        add         rdi,        r8

-%endif

-        dec         rcx

-        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_bilinear_predict16x16_sse2

-;(

-;    unsigned char  *src_ptr,

-;    int   src_pixels_per_line,

-;    int  xoffset,

-;    int  yoffset,

-;    unsigned char *dst_ptr,

-;    int dst_pitch

-;)

-extern sym(vp9_bilinear_filters_mmx)

-global sym(vp9_bilinear_predict16x16_sse2) PRIVATE

-sym(vp9_bilinear_predict16x16_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    ;const short *HFilter = bilinear_filters_mmx[xoffset]

-    ;const short *VFilter = bilinear_filters_mmx[yoffset]

-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]

-        movsxd      rax,        dword ptr arg(2) ;xoffset

-        cmp         rax,        0      ;skip first_pass filter if xoffset=0

-        je          .b16x16_sp_only

-        shl         rax,        5

-        add         rax,        rcx    ;HFilter

-        mov         rdi,        arg(4) ;dst_ptr

-        mov         rsi,        arg(0) ;src_ptr

-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

-        movdqa      xmm1,       [rax]

-        movdqa      xmm2,       [rax+16]

-        movsxd      rax,        dword ptr arg(3) ;yoffset

-        cmp         rax,        0      ;skip second_pass filter if yoffset=0

-        je          .b16x16_fp_only

-        shl         rax,        5

-        add         rax,        rcx    ;VFilter

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

-        pxor        xmm0,       xmm0

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(5) ;dst_pitch

-%endif

-        ; get the first horizontal line done

-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

-        punpckhbw   xmm4,       xmm0

-        pmullw      xmm3,       xmm1

-        pmullw      xmm4,       xmm1

-        movdqu      xmm5,       [rsi+1]

-        movdqa      xmm6,       xmm5

-        punpcklbw   xmm5,       xmm0

-        punpckhbw   xmm6,       xmm0

-        pmullw      xmm5,       xmm2

-        pmullw      xmm6,       xmm2

-        paddw       xmm3,       xmm5

-        paddw       xmm4,       xmm6

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        movdqa      xmm7,       xmm3

-        packuswb    xmm7,       xmm4

-        add         rsi,        rdx                 ; next line

-.next_row:

-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

-        punpckhbw   xmm4,       xmm0

-        pmullw      xmm3,       xmm1

-        pmullw      xmm4,       xmm1

-        movdqu      xmm5,       [rsi+1]

-        movdqa      xmm6,       xmm5

-        punpcklbw   xmm5,       xmm0

-        punpckhbw   xmm6,       xmm0

-        pmullw      xmm5,       xmm2

-        pmullw      xmm6,       xmm2

-        paddw       xmm3,       xmm5

-        paddw       xmm4,       xmm6

-        movdqa      xmm5,       xmm7

-        movdqa      xmm6,       xmm7

-        punpcklbw   xmm5,       xmm0

-        punpckhbw   xmm6,       xmm0

-        pmullw      xmm5,       [rax]

-        pmullw      xmm6,       [rax]

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        movdqa      xmm7,       xmm3

-        packuswb    xmm7,       xmm4

-        pmullw      xmm3,       [rax+16]

-        pmullw      xmm4,       [rax+16]

-        paddw       xmm3,       xmm5

-        paddw       xmm4,       xmm6

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        packuswb    xmm3,       xmm4

-        movdqa      [rdi],      xmm3                 ; store the results in the destination

-        add         rsi,        rdx                 ; next line

-%if ABI_IS_32BIT

-        add         rdi,        DWORD PTR arg(5) ;dst_pitch

-%else

-        add         rdi,        r8

-%endif

-        cmp         rdi,        rcx

-        jne         .next_row

-        jmp         .done

-.b16x16_sp_only:

-        movsxd      rax,        dword ptr arg(3) ;yoffset

-        shl         rax,        5

-        add         rax,        rcx    ;VFilter

-        mov         rdi,        arg(4) ;dst_ptr

-        mov         rsi,        arg(0) ;src_ptr

-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

-        movdqa      xmm1,       [rax]

-        movdqa      xmm2,       [rax+16]

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line

-        pxor        xmm0,       xmm0

-        ; get the first horizontal line done

-        movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

-        add         rsi,        rax                 ; next line

-.next_row_spo:

-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

-        movdqa      xmm5,       xmm7

-        movdqa      xmm6,       xmm7

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        movdqa      xmm7,       xmm3

-        punpcklbw   xmm5,       xmm0

-        punpckhbw   xmm6,       xmm0

-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

-        punpckhbw   xmm4,       xmm0

-        pmullw      xmm5,       xmm1

-        pmullw      xmm6,       xmm1

-        pmullw      xmm3,       xmm2

-        pmullw      xmm4,       xmm2

-        paddw       xmm3,       xmm5

-        paddw       xmm4,       xmm6

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        packuswb    xmm3,       xmm4

-        movdqa      [rdi],      xmm3                 ; store the results in the destination

-        add         rsi,        rax                 ; next line

-        add         rdi,        rdx                 ;dst_pitch

-        cmp         rdi,        rcx

-        jne         .next_row_spo

-        jmp         .done

-.b16x16_fp_only:

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line

-        pxor        xmm0,       xmm0

-.next_row_fpo:

-        movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

-        punpckhbw   xmm4,       xmm0

-        pmullw      xmm3,       xmm1

-        pmullw      xmm4,       xmm1

-        movdqu      xmm5,       [rsi+1]

-        movdqa      xmm6,       xmm5

-        punpcklbw   xmm5,       xmm0

-        punpckhbw   xmm6,       xmm0

-        pmullw      xmm5,       xmm2

-        pmullw      xmm6,       xmm2

-        paddw       xmm3,       xmm5

-        paddw       xmm4,       xmm6

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        packuswb    xmm3,       xmm4

-        movdqa      [rdi],      xmm3                 ; store the results in the destination

-        add         rsi,        rax                 ; next line

-        add         rdi,        rdx                 ; dst_pitch

-        cmp         rdi,        rcx

-        jne         .next_row_fpo

-.done:

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_bilinear_predict8x8_sse2

-;(

-;    unsigned char  *src_ptr,

-;    int   src_pixels_per_line,

-;    int  xoffset,

-;    int  yoffset,

-;    unsigned char *dst_ptr,

-;    int dst_pitch

-;)

-extern sym(vp9_bilinear_filters_mmx)

-global sym(vp9_bilinear_predict8x8_sse2) PRIVATE

-sym(vp9_bilinear_predict8x8_sse2):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    ALIGN_STACK 16, rax

-    sub         rsp, 144                         ; reserve 144 bytes

-    ;const short *HFilter = bilinear_filters_mmx[xoffset]

-    ;const short *VFilter = bilinear_filters_mmx[yoffset]

-        lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]

-        mov         rsi,        arg(0) ;src_ptr

-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

-    ;Read 9-line unaligned data in and put them on stack. This gives a big

-    ;performance boost.

-        movdqu      xmm0,       [rsi]

-        lea         rax,        [rdx + rdx*2]

-        movdqu      xmm1,       [rsi+rdx]

-        movdqu      xmm2,       [rsi+rdx*2]

-        add         rsi,        rax

-        movdqu      xmm3,       [rsi]

-        movdqu      xmm4,       [rsi+rdx]

-        movdqu      xmm5,       [rsi+rdx*2]

-        add         rsi,        rax

-        movdqu      xmm6,       [rsi]

-        movdqu      xmm7,       [rsi+rdx]

-        movdqa      XMMWORD PTR [rsp],            xmm0

-        movdqu      xmm0,       [rsi+rdx*2]

-        movdqa      XMMWORD PTR [rsp+16],         xmm1

-        movdqa      XMMWORD PTR [rsp+32],         xmm2

-        movdqa      XMMWORD PTR [rsp+48],         xmm3

-        movdqa      XMMWORD PTR [rsp+64],         xmm4

-        movdqa      XMMWORD PTR [rsp+80],         xmm5

-        movdqa      XMMWORD PTR [rsp+96],         xmm6

-        movdqa      XMMWORD PTR [rsp+112],        xmm7

-        movdqa      XMMWORD PTR [rsp+128],        xmm0

-        movsxd      rax,        dword ptr arg(2) ;xoffset

-        shl         rax,        5

-        add         rax,        rcx    ;HFilter

-        mov         rdi,        arg(4) ;dst_ptr

-        movsxd      rdx,        dword ptr arg(5) ;dst_pitch

-        movdqa      xmm1,       [rax]

-        movdqa      xmm2,       [rax+16]

-        movsxd      rax,        dword ptr arg(3) ;yoffset

-        shl         rax,        5

-        add         rax,        rcx    ;VFilter

-        lea         rcx,        [rdi+rdx*8]

-        movdqa      xmm5,       [rax]

-        movdqa      xmm6,       [rax+16]

-        pxor        xmm0,       xmm0

-        ; get the first horizontal line done

-        movdqa      xmm3,       XMMWORD PTR [rsp]

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        psrldq      xmm4,       1

-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07

-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08

-        pmullw      xmm3,       xmm1

-        pmullw      xmm4,       xmm2

-        paddw       xmm3,       xmm4

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        movdqa      xmm7,       xmm3

-        add         rsp,        16                 ; next line

-.next_row8x8:

-        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

-        movdqa      xmm4,       xmm3                 ; make a copy of current line

-        psrldq      xmm4,       1

-        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07

-        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08

-        pmullw      xmm3,       xmm1

-        pmullw      xmm4,       xmm2

-        paddw       xmm3,       xmm4

-        pmullw      xmm7,       xmm5

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        movdqa      xmm4,       xmm3

-        pmullw      xmm3,       xmm6

-        paddw       xmm3,       xmm7

-        movdqa      xmm7,       xmm4

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT        ; xmm3 /= 128

-        packuswb    xmm3,       xmm0

-        movq        [rdi],      xmm3                 ; store the results in the destination

-        add         rsp,        16                 ; next line

-        add         rdi,        rdx

-        cmp         rdi,        rcx

-        jne         .next_row8x8

-    ;add rsp, 144

-    pop rsp

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-SECTION_RODATA

-align 16

-rd:

-    times 8 dw 0x40

--- a/vp9/common/x86/vp9_subpixel_ssse3.asm

+++ /dev/null

@@ -1,1515 +1,0 @@

-;

-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

-;

-;  Use of this source code is governed by a BSD-style license

-;  that can be found in the LICENSE file in the root of the source

-;  tree. An additional intellectual property rights grant can be found

-;  in the file PATENTS.  All contributing project authors may

-;  be found in the AUTHORS file in the root of the source tree.

-;

-%include "vpx_ports/x86_abi_support.asm"

-%define BLOCK_HEIGHT_WIDTH 4

-%define VP9_FILTER_WEIGHT 128

-%define VP9_FILTER_SHIFT  7

-;/************************************************************************************

-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The

-; input pixel array has output_height rows. This routine assumes that output_height is an

-; even number. This function handles 8 pixels in horizontal direction, calculating ONE

-; rows each iteration to take advantage of the 128 bits operations.

-;

-; This is an implementation of some of the SSE optimizations first seen in ffvp8

-;

-;*************************************************************************************/

-;void vp9_filter_block1d8_h6_ssse3

-;(

-;    unsigned char  *src_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned char *output_ptr,

-;    unsigned int    output_pitch,

-;    unsigned int    output_height,

-;    unsigned int    vp9_filter_index

-;)

-global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE

-sym(vp9_filter_block1d8_h6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)   ;table index

-    xor         rsi, rsi

-    shl         rdx, 4

-    movdqa      xmm7, [GLOBAL(rd)]

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    mov         rdi, arg(2)             ;output_ptr

-    cmp         esi, DWORD PTR [rax]

-    je          vp9_filter_block1d8_h4_ssse3

-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5

-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line

-    movsxd      rcx, dword ptr arg(4)   ;output_height

-    movsxd      rdx, dword ptr arg(3)   ;output_pitch

-    sub         rdi, rdx

-;xmm3 free

-.filter_block1d8_h6_rowloop_ssse3:

-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

-    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

-    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

-    movdqa      xmm1,   xmm0

-    pmaddubsw   xmm0,   xmm4

-    movdqa      xmm2,   xmm1

-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]

-    pmaddubsw   xmm1,   xmm5

-    lea         rdi,    [rdi + rdx]

-    pmaddubsw   xmm2,   xmm6

-    lea         rsi,    [rsi + rax]

-    dec         rcx

-    paddsw      xmm0,   xmm1

-    paddsw      xmm2,   xmm7

-    paddsw      xmm0,   xmm2

-    psraw       xmm0,   7

-    packuswb    xmm0,   xmm0

-    movq        MMWORD Ptr [rdi], xmm0

-    jnz         .filter_block1d8_h6_rowloop_ssse3

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-vp9_filter_block1d8_h4_ssse3:

-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]

-    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

-    mov         rsi, arg(0)             ;src_ptr

-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line

-    movsxd      rcx, dword ptr arg(4)   ;output_height

-    movsxd      rdx, dword ptr arg(3)   ;output_pitch

-    sub         rdi, rdx

-.filter_block1d8_h4_rowloop_ssse3:

-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

-    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

-    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

-    movdqa      xmm2,   xmm0

-    pshufb      xmm0,   xmm3

-    pshufb      xmm2,   xmm4

-    pmaddubsw   xmm0,   xmm5

-    lea         rdi,    [rdi + rdx]

-    pmaddubsw   xmm2,   xmm6

-    lea         rsi,    [rsi + rax]

-    dec         rcx

-    paddsw      xmm0,   xmm7

-    paddsw      xmm0,   xmm2

-    psraw       xmm0,   7

-    packuswb    xmm0,   xmm0

-    movq        MMWORD Ptr [rdi], xmm0

-    jnz         .filter_block1d8_h4_rowloop_ssse3

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d16_h6_ssse3

-;(

-;    unsigned char  *src_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned char  *output_ptr,

-;    unsigned int    output_pitch,

-;    unsigned int    output_height,

-;    unsigned int    vp9_filter_index

-;)

-global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE

-sym(vp9_filter_block1d16_h6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)           ;table index

-    xor         rsi, rsi

-    shl         rdx, 4      ;

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    mov         rdi, arg(2)                     ;output_ptr

-    mov         rsi, arg(0)                     ;src_ptr

-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5

-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line

-    movsxd      rcx, dword ptr arg(4)           ;output_height

-    movsxd      rdx, dword ptr arg(3)           ;output_pitch

-.filter_block1d16_h6_rowloop_ssse3:

-    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

-    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

-    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

-    movdqa      xmm1,   xmm0

-    pmaddubsw   xmm0,   xmm4

-    movdqa      xmm2,   xmm1

-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]

-    movq        xmm3,   MMWORD PTR [rsi +  6]

-    pmaddubsw   xmm1,   xmm5

-    movq        xmm7,   MMWORD PTR [rsi + 11]

-    pmaddubsw   xmm2,   xmm6

-    punpcklbw   xmm3,   xmm7

-    paddsw      xmm0,   xmm1

-    movdqa      xmm1,   xmm3

-    pmaddubsw   xmm3,   xmm4

-    paddsw      xmm0,   xmm2

-    movdqa      xmm2,   xmm1

-    paddsw      xmm0,   [GLOBAL(rd)]

-    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

-    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]

-    psraw       xmm0,   7

-    pmaddubsw   xmm1,   xmm5

-    pmaddubsw   xmm2,   xmm6

-    packuswb    xmm0,   xmm0

-    lea         rsi,    [rsi + rax]

-    paddsw      xmm3,   xmm1

-    paddsw      xmm3,   xmm2

-    paddsw      xmm3,   [GLOBAL(rd)]

-    psraw       xmm3,   7

-    packuswb    xmm3,   xmm3

-    punpcklqdq  xmm0,   xmm3

-    movdqa      XMMWORD Ptr [rdi], xmm0

-    lea         rdi,    [rdi + rdx]

-    dec         rcx

-    jnz         .filter_block1d16_h6_rowloop_ssse3

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d4_h6_ssse3

-;(

-;    unsigned char  *src_ptr,

-;    unsigned int    src_pixels_per_line,

-;    unsigned char  *output_ptr,

-;    unsigned int    output_pitch,

-;    unsigned int    output_height,

-;    unsigned int    vp9_filter_index

-;)

-global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE

-sym(vp9_filter_block1d4_h6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)   ;table index

-    xor         rsi, rsi

-    shl         rdx, 4      ;

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    movdqa      xmm7, [GLOBAL(rd)]

-    cmp         esi, DWORD PTR [rax]

-    je          .vp9_filter_block1d4_h4_ssse3

-    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5

-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rdi, arg(2)             ;output_ptr

-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line

-    movsxd      rcx, dword ptr arg(4)   ;output_height

-    movsxd      rdx, dword ptr arg(3)   ;output_pitch

-;xmm3 free

-.filter_block1d4_h6_rowloop_ssse3:

-    movdqu      xmm0,   XMMWORD PTR [rsi - 2]

-    movdqa      xmm1, xmm0

-    pshufb      xmm0, [GLOBAL(shuf1b)]

-    movdqa      xmm2, xmm1

-    pshufb      xmm1, [GLOBAL(shuf2b)]

-    pmaddubsw   xmm0, xmm4

-    pshufb      xmm2, [GLOBAL(shuf3b)]

-    pmaddubsw   xmm1, xmm5

-;--

-    pmaddubsw   xmm2, xmm6

-    lea         rsi,    [rsi + rax]

-;--

-    paddsw      xmm0, xmm1

-    paddsw      xmm0, xmm7

-    pxor        xmm1, xmm1

-    paddsw      xmm0, xmm2

-    psraw       xmm0, 7

-    packuswb    xmm0, xmm0

-    movd        DWORD PTR [rdi], xmm0

-    add         rdi, rdx

-    dec         rcx

-    jnz         .filter_block1d4_h6_rowloop_ssse3

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-.vp9_filter_block1d4_h4_ssse3:

-    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

-    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]

-    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rdi, arg(2)             ;output_ptr

-    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line

-    movsxd      rcx, dword ptr arg(4)   ;output_height

-    movsxd      rdx, dword ptr arg(3)   ;output_pitch

-.filter_block1d4_h4_rowloop_ssse3:

-    movdqu      xmm1,   XMMWORD PTR [rsi - 2]

-    movdqa      xmm2, xmm1

-    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]

-    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]

-    pmaddubsw   xmm1, xmm5

-;--

-    pmaddubsw   xmm2, xmm6

-    lea         rsi,    [rsi + rax]

-;--

-    paddsw      xmm1, xmm7

-    paddsw      xmm1, xmm2

-    psraw       xmm1, 7

-    packuswb    xmm1, xmm1

-    movd        DWORD PTR [rdi], xmm1

-    add         rdi, rdx

-    dec         rcx

-    jnz         .filter_block1d4_h4_rowloop_ssse3

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d16_v6_ssse3

-;(

-;    unsigned char *src_ptr,

-;    unsigned int   src_pitch,

-;    unsigned char *output_ptr,

-;    unsigned int   out_pitch,

-;    unsigned int   output_height,

-;    unsigned int   vp9_filter_index

-;)

-global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE

-sym(vp9_filter_block1d16_v6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)   ;table index

-    xor         rsi, rsi

-    shl         rdx, 4      ;

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    cmp         esi, DWORD PTR [rax]

-    je          .vp9_filter_block1d16_v4_ssse3

-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5

-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line

-    mov         rdi, arg(2)             ;output_ptr

-%if ABI_IS_32BIT=0

-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch

-%endif

-    mov         rax, rsi

-    movsxd      rcx, DWORD PTR arg(4)   ;output_height

-    add         rax, rdx

-.vp9_filter_block1d16_v6_ssse3_loop:

-    movq        xmm1, MMWORD PTR [rsi]                  ;A

-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B

-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   xmm2, xmm4                  ;B D

-    punpcklbw   xmm3, xmm0                  ;C E

-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

-    pmaddubsw   xmm3, xmm6

-    punpcklbw   xmm1, xmm0                  ;A F

-    pmaddubsw   xmm2, xmm7

-    pmaddubsw   xmm1, xmm5

-    paddsw      xmm2, xmm3

-    paddsw      xmm2, xmm1

-    paddsw      xmm2, [GLOBAL(rd)]

-    psraw       xmm2, 7

-    packuswb    xmm2, xmm2

-    movq        MMWORD PTR [rdi], xmm2          ;store the results

-    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A

-    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B

-    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

-    punpcklbw   xmm2, xmm4                  ;B D

-    punpcklbw   xmm3, xmm0                  ;C E

-    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F

-    pmaddubsw   xmm3, xmm6

-    punpcklbw   xmm1, xmm0                  ;A F

-    pmaddubsw   xmm2, xmm7

-    pmaddubsw   xmm1, xmm5

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      xmm2, xmm3

-    paddsw      xmm2, xmm1

-    paddsw      xmm2, [GLOBAL(rd)]

-    psraw       xmm2, 7

-    packuswb    xmm2, xmm2

-    movq        MMWORD PTR [rdi+8], xmm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;out_pitch

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d16_v6_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-.vp9_filter_block1d16_v4_ssse3:

-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line

-    mov         rdi, arg(2)             ;output_ptr

-%if ABI_IS_32BIT=0

-    movsxd      r8, DWORD PTR arg(3)    ;out_pitch

-%endif

-    mov         rax, rsi

-    movsxd      rcx, DWORD PTR arg(4)   ;output_height

-    add         rax, rdx

-.vp9_filter_block1d16_v4_ssse3_loop:

-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B

-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   xmm2, xmm4                  ;B D

-    punpcklbw   xmm3, xmm0                  ;C E

-    pmaddubsw   xmm3, xmm6

-    pmaddubsw   xmm2, xmm7

-    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B

-    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

-    paddsw      xmm2, [GLOBAL(rd)]

-    paddsw      xmm2, xmm3

-    psraw       xmm2, 7

-    packuswb    xmm2, xmm2

-    punpcklbw   xmm5, xmm4                  ;B D

-    punpcklbw   xmm1, xmm0                  ;C E

-    pmaddubsw   xmm1, xmm6

-    pmaddubsw   xmm5, xmm7

-    movdqa      xmm4, [GLOBAL(rd)]

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      xmm5, xmm1

-    paddsw      xmm5, xmm4

-    psraw       xmm5, 7

-    packuswb    xmm5, xmm5

-    punpcklqdq  xmm2, xmm5

-    movdqa       XMMWORD PTR [rdi], xmm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;out_pitch

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d16_v4_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d8_v6_ssse3

-;(

-;    unsigned char *src_ptr,

-;    unsigned int   src_pitch,

-;    unsigned char *output_ptr,

-;    unsigned int   out_pitch,

-;    unsigned int   output_height,

-;    unsigned int   vp9_filter_index

-;)

-global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE

-sym(vp9_filter_block1d8_v6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)   ;table index

-    xor         rsi, rsi

-    shl         rdx, 4      ;

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line

-    mov         rdi, arg(2)             ;output_ptr

-%if ABI_IS_32BIT=0

-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch

-%endif

-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

-    cmp         esi, DWORD PTR [rax]

-    je          .vp9_filter_block1d8_v4_ssse3

-    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5

-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rax, rsi

-    add         rax, rdx

-.vp9_filter_block1d8_v6_ssse3_loop:

-    movq        xmm1, MMWORD PTR [rsi]                  ;A

-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B

-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   xmm2, xmm4                  ;B D

-    punpcklbw   xmm3, xmm0                  ;C E

-    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

-    movdqa      xmm4, [GLOBAL(rd)]

-    pmaddubsw   xmm3, xmm6

-    punpcklbw   xmm1, xmm0                  ;A F

-    pmaddubsw   xmm2, xmm7

-    pmaddubsw   xmm1, xmm5

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      xmm2, xmm3

-    paddsw      xmm2, xmm1

-    paddsw      xmm2, xmm4

-    psraw       xmm2, 7

-    packuswb    xmm2, xmm2

-    movq        MMWORD PTR [rdi], xmm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d8_v6_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-.vp9_filter_block1d8_v4_ssse3:

-    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4

-    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

-    movdqa      xmm5, [GLOBAL(rd)]

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rax, rsi

-    add         rax, rdx

-.vp9_filter_block1d8_v4_ssse3_loop:

-    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B

-    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C

-    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D

-    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   xmm2, xmm4                  ;B D

-    punpcklbw   xmm3, xmm0                  ;C E

-    pmaddubsw   xmm3, xmm6

-    pmaddubsw   xmm2, xmm7

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      xmm2, xmm3

-    paddsw      xmm2, xmm5

-    psraw       xmm2, 7

-    packuswb    xmm2, xmm2

-    movq        MMWORD PTR [rdi], xmm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d8_v4_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_filter_block1d4_v6_ssse3

-;(

-;    unsigned char *src_ptr,

-;    unsigned int   src_pitch,

-;    unsigned char *output_ptr,

-;    unsigned int   out_pitch,

-;    unsigned int   output_height,

-;    unsigned int   vp9_filter_index

-;)

-global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE

-sym(vp9_filter_block1d4_v6_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    movsxd      rdx, DWORD PTR arg(5)   ;table index

-    xor         rsi, rsi

-    shl         rdx, 4      ;

-    lea         rax, [GLOBAL(k0_k5)]

-    add         rax, rdx

-    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line

-    mov         rdi, arg(2)             ;output_ptr

-%if ABI_IS_32BIT=0

-    movsxd      r8, DWORD PTR arg(3)    ; out_pitch

-%endif

-    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

-    cmp         esi, DWORD PTR [rax]

-    je          .vp9_filter_block1d4_v4_ssse3

-    movq        mm5, MMWORD PTR [rax]         ;k0_k5

-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4

-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rax, rsi

-    add         rax, rdx

-.vp9_filter_block1d4_v6_ssse3_loop:

-    movd        mm1, DWORD PTR [rsi]                  ;A

-    movd        mm2, DWORD PTR [rsi + rdx]            ;B

-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C

-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D

-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   mm2, mm4                  ;B D

-    punpcklbw   mm3, mm0                  ;C E

-    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F

-    movq        mm4, [GLOBAL(rd)]

-    pmaddubsw   mm3, mm6

-    punpcklbw   mm1, mm0                  ;A F

-    pmaddubsw   mm2, mm7

-    pmaddubsw   mm1, mm5

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      mm2, mm3

-    paddsw      mm2, mm1

-    paddsw      mm2, mm4

-    psraw       mm2, 7

-    packuswb    mm2, mm2

-    movd        DWORD PTR [rdi], mm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d4_v6_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-.vp9_filter_block1d4_v4_ssse3:

-    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4

-    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3

-    movq        mm5, MMWORD PTR [GLOBAL(rd)]

-    mov         rsi, arg(0)             ;src_ptr

-    mov         rax, rsi

-    add         rax, rdx

-.vp9_filter_block1d4_v4_ssse3_loop:

-    movd        mm2, DWORD PTR [rsi + rdx]            ;B

-    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C

-    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D

-    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E

-    punpcklbw   mm2, mm4                  ;B D

-    punpcklbw   mm3, mm0                  ;C E

-    pmaddubsw   mm3, mm6

-    pmaddubsw   mm2, mm7

-    add         rsi,  rdx

-    add         rax,  rdx

-;--

-;--

-    paddsw      mm2, mm3

-    paddsw      mm2, mm5

-    psraw       mm2, 7

-    packuswb    mm2, mm2

-    movd        DWORD PTR [rdi], mm2

-%if ABI_IS_32BIT

-    add         rdi,        DWORD PTR arg(3) ;[out_pitch]

-%else

-    add         rdi,        r8

-%endif

-    dec         rcx

-    jnz         .vp9_filter_block1d4_v4_ssse3_loop

-    ; begin epilog

-    pop rdi

-    pop rsi

-    RESTORE_GOT

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_bilinear_predict16x16_ssse3

-;(

-;    unsigned char  *src_ptr,

-;    int   src_pixels_per_line,

-;    int  xoffset,

-;    int  yoffset,

-;    unsigned char *dst_ptr,

-;    int dst_pitch

-;)

-global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE

-sym(vp9_bilinear_predict16x16_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]

-        movsxd      rax,        dword ptr arg(2)    ; xoffset

-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0

-        je          .b16x16_sp_only

-        shl         rax,        4

-        lea         rax,        [rax + rcx]         ; HFilter

-        mov         rdi,        arg(4)              ; dst_ptr

-        mov         rsi,        arg(0)              ; src_ptr

-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

-        movdqa      xmm1,       [rax]

-        movsxd      rax,        dword ptr arg(3)    ; yoffset

-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0

-        je          .b16x16_fp_only

-        shl         rax,        4

-        lea         rax,        [rax + rcx]         ; VFilter

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line

-        movdqa      xmm2,       [rax]

-%if ABI_IS_32BIT=0

-        movsxd      r8,         dword ptr arg(5)    ; dst_pitch

-%endif

-        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07

-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08

-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15

-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16

-        lea         rsi,        [rsi + rdx]         ; next line

-        pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14

-        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16

-        pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value

-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128

-        movdqa      xmm7,       xmm3

-        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

-.next_row:

-        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07

-        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

-        punpcklbw   xmm6,       xmm5

-        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15

-        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16

-        lea         rsi,        [rsi + rdx]         ; next line

-        pmaddubsw   xmm6,       xmm1

-        punpcklbw   xmm4,       xmm5

-        pmaddubsw   xmm4,       xmm1

-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value

-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128

-        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value

-        psraw       xmm4,       VP9_FILTER_SHIFT    ; xmm4 /= 128

-        packuswb    xmm6,       xmm4

-        movdqa      xmm5,       xmm7

-        punpcklbw   xmm5,       xmm6

-        pmaddubsw   xmm5,       xmm2

-        punpckhbw   xmm7,       xmm6

-        pmaddubsw   xmm7,       xmm2

-        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value

-        psraw       xmm5,       VP9_FILTER_SHIFT    ; xmm5 /= 128

-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value

-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128

-        packuswb    xmm5,       xmm7

-        movdqa      xmm7,       xmm6

-        movdqa      [rdi],      xmm5                ; store the results in the destination

-%if ABI_IS_32BIT

-        add         rdi,        DWORD PTR arg(5)    ; dst_pitch

-%else

-        add         rdi,        r8

-%endif

-        cmp         rdi,        rcx

-        jne         .next_row

-        jmp         .done

-.b16x16_sp_only:

-        movsxd      rax,        dword ptr arg(3)    ; yoffset

-        shl         rax,        4

-        lea         rax,        [rax + rcx]         ; VFilter

-        mov         rdi,        arg(4)              ; dst_ptr

-        mov         rsi,        arg(0)              ; src_ptr

-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

-        movdqa      xmm1,       [rax]               ; VFilter

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

-        ; get the first horizontal line done

-        movq        xmm4,       [rsi]               ; load row 0

-        movq        xmm2,       [rsi + 8]           ; load row 0

-        lea         rsi,        [rsi + rax]         ; next line

-.next_row_sp:

-        movq        xmm3,       [rsi]               ; load row + 1

-        movq        xmm5,       [rsi + 8]           ; load row + 1

-        punpcklbw   xmm4,       xmm3

-        punpcklbw   xmm2,       xmm5

-        pmaddubsw   xmm4,       xmm1

-        movq        xmm7,       [rsi + rax]         ; load row + 2

-        pmaddubsw   xmm2,       xmm1

-        movq        xmm6,       [rsi + rax + 8]     ; load row + 2

-        punpcklbw   xmm3,       xmm7

-        punpcklbw   xmm5,       xmm6

-        pmaddubsw   xmm3,       xmm1

-        paddw       xmm4,       [GLOBAL(rd)]

-        pmaddubsw   xmm5,       xmm1

-        paddw       xmm2,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        psraw       xmm2,       VP9_FILTER_SHIFT

-        packuswb    xmm4,       xmm2

-        paddw       xmm3,       [GLOBAL(rd)]

-        movdqa      [rdi],      xmm4                ; store row 0

-        paddw       xmm5,       [GLOBAL(rd)]

-        psraw       xmm3,       VP9_FILTER_SHIFT

-        psraw       xmm5,       VP9_FILTER_SHIFT

-        packuswb    xmm3,       xmm5

-        movdqa      xmm4,       xmm7

-        movdqa      [rdi + rdx],xmm3                ; store row 1

-        lea         rsi,        [rsi + 2*rax]

-        movdqa      xmm2,       xmm6

-        lea         rdi,        [rdi + 2*rdx]

-        cmp         rdi,        rcx

-        jne         .next_row_sp

-        jmp         .done

-.b16x16_fp_only:

-        lea         rcx,        [rdi+rdx*8]

-        lea         rcx,        [rcx+rdx*8]

-        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

-.next_row_fp:

-        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07

-        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08

-        punpcklbw   xmm2,       xmm4

-        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15

-        pmaddubsw   xmm2,       xmm1

-        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16

-        lea         rsi,        [rsi + rax]         ; next line

-        punpcklbw   xmm3,       xmm4

-        pmaddubsw   xmm3,       xmm1

-        movq        xmm5,       [rsi]

-        paddw       xmm2,       [GLOBAL(rd)]

-        movq        xmm7,       [rsi+1]

-        movq        xmm6,       [rsi+8]

-        psraw       xmm2,       VP9_FILTER_SHIFT

-        punpcklbw   xmm5,       xmm7

-        movq        xmm7,       [rsi+9]

-        paddw       xmm3,       [GLOBAL(rd)]

-        pmaddubsw   xmm5,       xmm1

-        psraw       xmm3,       VP9_FILTER_SHIFT

-        punpcklbw   xmm6,       xmm7

-        packuswb    xmm2,       xmm3

-        pmaddubsw   xmm6,       xmm1

-        movdqa      [rdi],      xmm2                ; store the results in the destination

-        paddw       xmm5,       [GLOBAL(rd)]

-        lea         rdi,        [rdi + rdx]         ; dst_pitch

-        psraw       xmm5,       VP9_FILTER_SHIFT

-        paddw       xmm6,       [GLOBAL(rd)]

-        psraw       xmm6,       VP9_FILTER_SHIFT

-        packuswb    xmm5,       xmm6

-        lea         rsi,        [rsi + rax]         ; next line

-        movdqa      [rdi],      xmm5                ; store the results in the destination

-        lea         rdi,        [rdi + rdx]         ; dst_pitch

-        cmp         rdi,        rcx

-        jne         .next_row_fp

-.done:

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-;void vp9_bilinear_predict8x8_ssse3

-;(

-;    unsigned char  *src_ptr,

-;    int   src_pixels_per_line,

-;    int  xoffset,

-;    int  yoffset,

-;    unsigned char *dst_ptr,

-;    int dst_pitch

-;)

-global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE

-sym(vp9_bilinear_predict8x8_ssse3):

-    push        rbp

-    mov         rbp, rsp

-    SHADOW_ARGS_TO_STACK 6

-    SAVE_XMM 7

-    GET_GOT     rbx

-    push        rsi

-    push        rdi

-    ; end prolog

-    ALIGN_STACK 16, rax

-    sub         rsp, 144                         ; reserve 144 bytes

-        lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]

-        mov         rsi,        arg(0) ;src_ptr

-        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

-    ;Read 9-line unaligned data in and put them on stack. This gives a big

-    ;performance boost.

-        movdqu      xmm0,       [rsi]

-        lea         rax,        [rdx + rdx*2]

-        movdqu      xmm1,       [rsi+rdx]

-        movdqu      xmm2,       [rsi+rdx*2]

-        add         rsi,        rax

-        movdqu      xmm3,       [rsi]

-        movdqu      xmm4,       [rsi+rdx]

-        movdqu      xmm5,       [rsi+rdx*2]

-        add         rsi,        rax

-        movdqu      xmm6,       [rsi]

-        movdqu      xmm7,       [rsi+rdx]

-        movdqa      XMMWORD PTR [rsp],            xmm0

-        movdqu      xmm0,       [rsi+rdx*2]

-        movdqa      XMMWORD PTR [rsp+16],         xmm1

-        movdqa      XMMWORD PTR [rsp+32],         xmm2

-        movdqa      XMMWORD PTR [rsp+48],         xmm3

-        movdqa      XMMWORD PTR [rsp+64],         xmm4

-        movdqa      XMMWORD PTR [rsp+80],         xmm5

-        movdqa      XMMWORD PTR [rsp+96],         xmm6

-        movdqa      XMMWORD PTR [rsp+112],        xmm7

-        movdqa      XMMWORD PTR [rsp+128],        xmm0

-        movsxd      rax,        dword ptr arg(2)    ; xoffset

-        cmp         rax,        0                   ; skip first_pass filter if xoffset=0

-        je          .b8x8_sp_only

-        shl         rax,        4

-        add         rax,        rcx                 ; HFilter

-        mov         rdi,        arg(4)              ; dst_ptr

-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

-        movdqa      xmm0,       [rax]

-        movsxd      rax,        dword ptr arg(3)    ; yoffset

-        cmp         rax,        0                   ; skip second_pass filter if yoffset=0

-        je          .b8x8_fp_only

-        shl         rax,        4

-        lea         rax,        [rax + rcx]         ; VFilter

-        lea         rcx,        [rdi+rdx*8]

-        movdqa      xmm1,       [rax]

-        ; get the first horizontal line done

-        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

-        movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx

-        psrldq      xmm5,       1

-        lea         rsp,        [rsp + 16]          ; next line

-        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08

-        pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14

-        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

-        psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128

-        movdqa      xmm7,       xmm3

-        packuswb    xmm7,       xmm7                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

-.next_row:

-        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

-        lea         rsp,        [rsp + 16]          ; next line

-        movdqa      xmm5,       xmm6

-        psrldq      xmm5,       1

-        punpcklbw   xmm6,       xmm5

-        pmaddubsw   xmm6,       xmm0

-        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value

-        psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128

-        packuswb    xmm6,       xmm6

-        punpcklbw   xmm7,       xmm6

-        pmaddubsw   xmm7,       xmm1

-        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value

-        psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128

-        packuswb    xmm7,       xmm7

-        movq        [rdi],      xmm7                ; store the results in the destination

-        lea         rdi,        [rdi + rdx]

-        movdqa      xmm7,       xmm6

-        cmp         rdi,        rcx

-        jne         .next_row

-        jmp         .done8x8

-.b8x8_sp_only:

-        movsxd      rax,        dword ptr arg(3)    ; yoffset

-        shl         rax,        4

-        lea         rax,        [rax + rcx]         ; VFilter

-        mov         rdi,        arg(4) ;dst_ptr

-        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

-        movdqa      xmm0,       [rax]               ; VFilter

-        movq        xmm1,       XMMWORD PTR [rsp]

-        movq        xmm2,       XMMWORD PTR [rsp+16]

-        movq        xmm3,       XMMWORD PTR [rsp+32]

-        punpcklbw   xmm1,       xmm2

-        movq        xmm4,       XMMWORD PTR [rsp+48]

-        punpcklbw   xmm2,       xmm3

-        movq        xmm5,       XMMWORD PTR [rsp+64]

-        punpcklbw   xmm3,       xmm4

-        movq        xmm6,       XMMWORD PTR [rsp+80]

-        punpcklbw   xmm4,       xmm5

-        movq        xmm7,       XMMWORD PTR [rsp+96]

-        punpcklbw   xmm5,       xmm6

-        pmaddubsw   xmm1,       xmm0

-        pmaddubsw   xmm2,       xmm0

-        pmaddubsw   xmm3,       xmm0

-        pmaddubsw   xmm4,       xmm0

-        pmaddubsw   xmm5,       xmm0

-        punpcklbw   xmm6,       xmm7

-        pmaddubsw   xmm6,       xmm0

-        paddw       xmm1,       [GLOBAL(rd)]

-        paddw       xmm2,       [GLOBAL(rd)]

-        psraw       xmm1,       VP9_FILTER_SHIFT

-        paddw       xmm3,       [GLOBAL(rd)]

-        psraw       xmm2,       VP9_FILTER_SHIFT

-        paddw       xmm4,       [GLOBAL(rd)]

-        psraw       xmm3,       VP9_FILTER_SHIFT

-        paddw       xmm5,       [GLOBAL(rd)]

-        psraw       xmm4,       VP9_FILTER_SHIFT

-        paddw       xmm6,       [GLOBAL(rd)]

-        psraw       xmm5,       VP9_FILTER_SHIFT

-        psraw       xmm6,       VP9_FILTER_SHIFT

-        packuswb    xmm1,       xmm1

-        packuswb    xmm2,       xmm2

-        movq        [rdi],      xmm1

-        packuswb    xmm3,       xmm3

-        movq        [rdi+rdx],  xmm2

-        packuswb    xmm4,       xmm4

-        movq        xmm1,       XMMWORD PTR [rsp+112]

-        lea         rdi,        [rdi + 2*rdx]

-        movq        xmm2,       XMMWORD PTR [rsp+128]

-        packuswb    xmm5,       xmm5

-        movq        [rdi],      xmm3

-        packuswb    xmm6,       xmm6

-        movq        [rdi+rdx],  xmm4

-        lea         rdi,        [rdi + 2*rdx]

-        punpcklbw   xmm7,       xmm1

-        movq        [rdi],      xmm5

-        pmaddubsw   xmm7,       xmm0

-        movq        [rdi+rdx],  xmm6

-        punpcklbw   xmm1,       xmm2

-        pmaddubsw   xmm1,       xmm0

-        paddw       xmm7,       [GLOBAL(rd)]

-        psraw       xmm7,       VP9_FILTER_SHIFT

-        paddw       xmm1,       [GLOBAL(rd)]

-        psraw       xmm1,       VP9_FILTER_SHIFT

-        packuswb    xmm7,       xmm7

-        packuswb    xmm1,       xmm1

-        lea         rdi,        [rdi + 2*rdx]

-        movq        [rdi],      xmm7

-        movq        [rdi+rdx],  xmm1

-        lea         rsp,        [rsp + 144]

-        jmp         .done8x8

-.b8x8_fp_only:

-        lea         rcx,        [rdi+rdx*8]

-.next_row_fp:

-        movdqa      xmm1,       XMMWORD PTR [rsp]

-        movdqa      xmm3,       XMMWORD PTR [rsp+16]

-        movdqa      xmm2,       xmm1

-        movdqa      xmm5,       XMMWORD PTR [rsp+32]

-        psrldq      xmm2,       1

-        movdqa      xmm7,       XMMWORD PTR [rsp+48]

-        movdqa      xmm4,       xmm3

-        psrldq      xmm4,       1

-        movdqa      xmm6,       xmm5

-        psrldq      xmm6,       1

-        punpcklbw   xmm1,       xmm2

-        pmaddubsw   xmm1,       xmm0

-        punpcklbw   xmm3,       xmm4

-        pmaddubsw   xmm3,       xmm0

-        punpcklbw   xmm5,       xmm6

-        pmaddubsw   xmm5,       xmm0

-        movdqa      xmm2,       xmm7

-        psrldq      xmm2,       1

-        punpcklbw   xmm7,       xmm2

-        pmaddubsw   xmm7,       xmm0

-        paddw       xmm1,       [GLOBAL(rd)]

-        psraw       xmm1,       VP9_FILTER_SHIFT

-        paddw       xmm3,       [GLOBAL(rd)]

-        psraw       xmm3,       VP9_FILTER_SHIFT

-        paddw       xmm5,       [GLOBAL(rd)]

-        psraw       xmm5,       VP9_FILTER_SHIFT

-        paddw       xmm7,       [GLOBAL(rd)]

-        psraw       xmm7,       VP9_FILTER_SHIFT

-        packuswb    xmm1,       xmm1

-        packuswb    xmm3,       xmm3

-        packuswb    xmm5,       xmm5

-        movq        [rdi],      xmm1

-        packuswb    xmm7,       xmm7

-        movq        [rdi+rdx],  xmm3

-        lea         rdi,        [rdi + 2*rdx]

-        movq        [rdi],      xmm5

-        lea         rsp,        [rsp + 4*16]

-        movq        [rdi+rdx],  xmm7

-        lea         rdi,        [rdi + 2*rdx]

-        cmp         rdi,        rcx

-        jne         .next_row_fp

-        lea         rsp,        [rsp + 16]

-.done8x8:

-    ;add rsp, 144

-    pop         rsp

-    ; begin epilog

-    pop         rdi

-    pop         rsi

-    RESTORE_GOT

-    RESTORE_XMM

-    UNSHADOW_ARGS

-    pop         rbp

-    ret

-SECTION_RODATA

-align 16

-shuf1b:

-    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12

-shuf2b:

-    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11

-shuf3b:

-    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10

-align 16

-shuf2bfrom1:

-    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13

-align 16

-shuf3bfrom1:

-    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11

-align 16

-rd:

-    times 8 dw 0x40

-align 16

-k0_k5:

-    times 8 db 0, 0             ;placeholder

-    times 8 db 0, 0

-    times 8 db 2, 1

-    times 8 db 0, 0

-    times 8 db 3, 3

-    times 8 db 0, 0

-    times 8 db 1, 2

-    times 8 db 0, 0

-k1_k3:

-    times 8 db  0,    0         ;placeholder

-    times 8 db  -6,  12

-    times 8 db -11,  36

-    times 8 db  -9,  50

-    times 8 db -16,  77

-    times 8 db  -6,  93

-    times 8 db  -8, 108

-    times 8 db  -1, 123

-k2_k4:

-    times 8 db 128,    0        ;placeholder

-    times 8 db 123,   -1

-    times 8 db 108,   -8

-    times 8 db  93,   -6

-    times 8 db  77,  -16

-    times 8 db  50,   -9

-    times 8 db  36,  -11

-    times 8 db  12,   -6

-align 16

-bilinear_filters_ssse3:

-    times 8 db 128, 0

-    times 8 db 120, 8

-    times 8 db 112, 16

-    times 8 db 104, 24

-    times 8 db 96,  32

-    times 8 db 88,  40

-    times 8 db 80,  48

-    times 8 db 72,  56

-    times 8 db 64,  64

-    times 8 db 56,  72

-    times 8 db 48,  80

-    times 8 db 40,  88

-    times 8 db 32,  96

-    times 8 db 24,  104

-    times 8 db 16,  112

-    times 8 db 8,   120

--- a/vp9/common/x86/vp9_subpixel_x86.h

+++ /dev/null

@@ -1,109 +1,0 @@

-/*

- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

- *

- *  Use of this source code is governed by a BSD-style license

- *  that can be found in the LICENSE file in the root of the source

- *  tree. An additional intellectual property rights grant can be found

- *  in the file PATENTS.  All contributing project authors may

- *  be found in the AUTHORS file in the root of the source tree.

- */

-#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_

-#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_

-/* Note:

- *

- * This platform is commonly built for runtime CPU detection. If you modify

- * any of the function mappings present in this file, be sure to also update

- * them in the function pointer initialization code

- */

-#if HAVE_MMX

-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);

-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);

-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef  vp9_subpix_sixtap16x16

-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx

-#undef  vp9_subpix_sixtap8x8

-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx

-#undef  vp9_subpix_sixtap8x4

-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx

-#undef  vp9_subpix_sixtap4x4

-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx

-#undef  vp9_subpix_bilinear16x16

-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx

-#endif

-#endif

-#if HAVE_SSE2

-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);

-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);

-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef  vp9_subpix_sixtap16x16

-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2

-#undef  vp9_subpix_sixtap8x8

-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2

-#undef  vp9_subpix_sixtap8x4

-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2

-#undef  vp9_subpix_bilinear16x16

-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2

-#undef  vp9_subpix_bilinear8x8

-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2

-#endif

-#endif

-#if HAVE_SSSE3

-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);

-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);

-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);

-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);

-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);

-#if !CONFIG_RUNTIME_CPU_DETECT

-#undef  vp9_subpix_sixtap16x16

-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3

-#undef  vp9_subpix_sixtap8x8

-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3

-#undef  vp9_subpix_sixtap8x4

-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3

-#undef  vp9_subpix_sixtap4x4

-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3

-#undef  vp9_subpix_bilinear16x16

-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3

-#undef  vp9_subpix_bilinear8x8

-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3

-#endif

-#endif

-#endif

--- a/vp9/encoder/vp9_onyx_if.c

+++ b/vp9/encoder/vp9_onyx_if.c

@@ -11,6 +11,7 @@

 #include "vpx_config.h"

 #include "vp9/common/vp9_onyxc_int.h"

+#include "vp9/common/vp9_reconinter.h"

 #include "vp9/encoder/vp9_onyx_int.h"

 #include "vp9/common/vp9_systemdependent.h"

 #include "vp9/encoder/vp9_quantize.h"

@@ -3775,6 +3776,7 @@

   cm->fb_idx_ref_cnt[cm->new_fb_idx]--;

   cm->new_fb_idx = get_free_fb(cm);

+  vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);

   if (cpi->pass == 1) {

     Pass1Encode(cpi, size, dest, frame_flags);

   } else if (cpi->pass == 2) {

--- a/vp9/encoder/vp9_rdopt.c

+++ b/vp9/encoder/vp9_rdopt.c

@@ -2237,9 +2237,9 @@

       BLOCK *be = &x->block[i];

       int thisdistortion;

-      vp9_build_inter_predictors_b(bd, 16, xd->subpixel_predict4x4);

+      vp9_build_inter_predictors_b(bd, 16, &xd->subpix);

       if (xd->mode_info_context->mbmi.second_ref_frame > 0)

-        vp9_build_2nd_inter_predictors_b(bd, 16, xd->subpixel_predict_avg4x4);

+        vp9_build_2nd_inter_predictors_b(bd, 16, &xd->subpix);

       vp9_subtract_b(be, bd, 16);

       x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);

       x->quantize_b_4x4(be, bd);

--- a/vp9/encoder/vp9_temporal_filter.c

+++ b/vp9/encoder/vp9_temporal_filter.c

@@ -50,12 +50,11 @@

   // Y

   yptr = y_mb_ptr + (mv_row >> 3) * stride + (mv_col >> 3);

-  if ((mv_row | mv_col) & 7) {

-    xd->subpixel_predict16x16(yptr, stride,

-                             (mv_col & 7) << 1, (mv_row & 7) << 1, &pred[0], 16);

-  } else {

-    vp9_copy_mem16x16(yptr, stride, &pred[0], 16);

-  }

+  xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][0](

+      yptr, stride, &pred[0], 16,

+      xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4,

+      16, 16);

   // U & V

   omv_row = mv_row;

@@ -67,15 +66,17 @@

   uptr = u_mb_ptr + offset;

   vptr = v_mb_ptr + offset;

-  if ((omv_row | omv_col) & 15) {

-    xd->subpixel_predict8x8(uptr, stride,

-                           (omv_col & 15), (omv_row & 15), &pred[256], 8);

-    xd->subpixel_predict8x8(vptr, stride,

-                           (omv_col & 15), (omv_row & 15), &pred[320], 8);

-  } else {

-    vp9_copy_mem8x8(uptr, stride, &pred[256], 8);

-    vp9_copy_mem8x8(vptr, stride, &pred[320], 8);

-  }

+  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0](

+      uptr, stride, &pred[256], 8,

+      xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4,

+      8, 8);

+  xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][0](

+      vptr, stride, &pred[320], 8,

+      xd->subpix.filter_x[(omv_col & 15)], xd->subpix.x_step_q4,

+      xd->subpix.filter_y[(omv_row & 15)], xd->subpix.y_step_q4,

+      8, 8);

 void vp9_temporal_filter_apply_c(uint8_t *frame1,

--- a/vp9/encoder/vp9_variance_c.c

+++ b/vp9/encoder/vp9_variance_c.c

@@ -142,8 +142,8 @@

   const int16_t *HFilter, *VFilter;

   uint16_t FData3[5 * 4];  // Temp data bufffer used in filtering

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   // First filter 1d Horizontal

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);

@@ -166,8 +166,8 @@

   uint8_t temp2[20 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);

   var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);

@@ -186,8 +186,8 @@

   uint8_t temp2[20 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);

   var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);

@@ -206,8 +206,8 @@

   uint8_t temp2[68 * 64];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,

                                     1, 65, 64, HFilter);

@@ -227,8 +227,8 @@

   uint8_t temp2[36 * 32];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 33, 32, HFilter);

   var_filter_block2d_bil_second_pass(FData3, temp2, 32, 32, 32, 32, VFilter);

@@ -367,8 +367,8 @@

   uint8_t temp2[20 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);

   var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);

@@ -387,8 +387,8 @@

   uint8_t temp2[20 * 16];

   const int16_t *HFilter, *VFilter;

-  HFilter = vp9_bilinear_filters[xoffset];

-  VFilter = vp9_bilinear_filters[yoffset];

+  HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);

+  VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);

   var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line,

                                     1, 17, 8, HFilter);

--- a/vp9/vp9_common.mk

+++ b/vp9/vp9_common.mk

@@ -16,6 +16,8 @@

 VP9_COMMON_SRCS-yes += common/vp9_asm_com_offsets.c

 VP9_COMMON_SRCS-yes += common/vp9_blockd.c

 VP9_COMMON_SRCS-yes += common/vp9_coefupdateprobs.h

+VP9_COMMON_SRCS-yes += common/vp9_convolve.c

+VP9_COMMON_SRCS-yes += common/vp9_convolve.h

 VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c

 VP9_COMMON_SRCS-yes += common/vp9_default_coef_probs.h

 VP9_COMMON_SRCS-yes += common/vp9_entropy.c

@@ -54,7 +56,6 @@

 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h

 VP9_COMMON_SRCS-yes += common/vp9_seg_common.c

 VP9_COMMON_SRCS-yes += common/vp9_setupintrarecon.h

-VP9_COMMON_SRCS-yes += common/vp9_subpixel.h

 VP9_COMMON_SRCS-yes += common/vp9_swapyv12buffer.h

 VP9_COMMON_SRCS-yes += common/vp9_systemdependent.h

 VP9_COMMON_SRCS-yes += common/vp9_textblit.h

@@ -79,7 +80,6 @@

 VP9_COMMON_SRCS-$(CONFIG_IMPLICIT_SEGMENTATION) += common/vp9_implicit_segmentation.c

 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.h

-VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_subpixel_x86.h

 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_x86.h

 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h

 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c

@@ -88,7 +88,6 @@

 VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c

 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm

 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm

-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_subpixel_mmx.asm

 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm

@@ -96,10 +95,8 @@

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpel_variance_impl_sse2.asm

-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c

 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm

-VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm

 ifeq ($(CONFIG_POSTPROC),yes)

 VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm

@@ -111,19 +108,10 @@

 VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm

 endif

-VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_filter_sse4.c

-ifeq ($(HAVE_SSE4_1),yes)

-vp9/common/x86/vp9_filter_sse4.c.o: CFLAGS += -msse4

-vp9/common/x86/vp9_filter_sse4.c.d: CFLAGS += -msse4

-endif

-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_filter_sse2.c

 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c

 ifeq ($(HAVE_SSE2),yes)

-vp9/common/x86/vp9_filter_sse2.c.o: CFLAGS += -msse2

 vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2

 vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2

-vp9/common/x86/vp9_filter_sse2.c.d: CFLAGS += -msse2

 vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2

 vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2

 endif