shithub: libvpx

ref: 432cd4bfb795534c8ba479fd735cc11dc8562469
parent: 39f03bf9c672b8e7c4b483d4e919a695707c90dd
author: Jingning Han <jingning@google.com>
date: Mon Jul 6 05:33:27 EDT 2015

Move subtract functions from vp9 to vpx_dsp

Factor out the subtraction operator as a common function.

Change-Id: I526e703477c6a290e0e3e3c8898f8bb1ca82779b
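
For reference, the operation being moved is the plain per-pixel residual: each
diff sample is the source sample minus the prediction sample, computed over a
rows x cols block with independent strides for the three buffers. Below is a
minimal self-contained sketch of that contract; subtract_block() here simply
mirrors the vpx_subtract_block_c() added in vpx_dsp/subtract.c further down,
and the 4x4 block and sample values in main() are made up for illustration.

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors vpx_subtract_block_c() from vpx_dsp/subtract.c:
     * writes src[] minus pred[] into diff[], row by row. */
    static void subtract_block(int rows, int cols,
                               int16_t *diff, ptrdiff_t diff_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               const uint8_t *pred, ptrdiff_t pred_stride) {
      int r, c;

      for (r = 0; r < rows; r++) {
        for (c = 0; c < cols; c++)
          diff[c] = src[c] - pred[c];

        diff += diff_stride;
        pred += pred_stride;
        src  += src_stride;
      }
    }

    int main(void) {
      /* Hypothetical 4x4 block, stored contiguously (stride == 4). */
      const uint8_t src[16] = { 10,  20,  30,  40,
                                50,  60,  70,  80,
                                90, 100, 110, 120,
                               130, 140, 150, 160 };
      const uint8_t pred[16] = { 12,  18,  30,  40,
                                 55,  60,  64,  90,
                                 90, 105, 100, 120,
                                128, 150, 149, 160 };
      int16_t diff[16];

      subtract_block(4, 4, diff, 4, src, 4, pred, 4);

      /* diff[0] == 10 - 12 == -2: residuals can be negative, hence int16_t. */
      printf("diff[0] = %d\n", diff[0]);
      return 0;
    }

The SSE2, NEON, and MSA variants in this patch compute exactly these values;
they only vectorize the loads, the widening subtract, and the stores. The
vpx_dsp_rtcd_defs.pl hunk registers the new prototype with the run-time CPU
dispatch: add_proto declares vpx_subtract_block, and specialize lists the
optimized versions (neon, msa, and the x86inc SSE2 assembly) that the
generated vpx_dsp_rtcd.h can select at run time.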

--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -14,6 +14,7 @@
 #include "test/register_state_check.h"
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -89,15 +90,15 @@
 }
 
 INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_c));
+                        ::testing::Values(vpx_subtract_block_c));
 
 #if HAVE_SSE2 && CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_sse2));
+                        ::testing::Values(vpx_subtract_block_sse2));
 #endif
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_neon));
+                        ::testing::Values(vpx_subtract_block_neon));
 #endif
 
 }  // namespace vp9
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -19,6 +19,8 @@
 #include "vpx_mem/vpx_mem.h"
 #include "rdopt.h"
 
+// TODO(jingning,johannkoenig): use vpx_subtract_block to replace
+// codec-specific vp8_subtract_ functions.
 void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
 {
     unsigned char *src_ptr = (*(be->base_src) + be->src);
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -922,9 +922,6 @@
 
 # ENCODEMB INVOKE
 
-add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vp9_subtract_block neon msa/, "$sse2_x86inc";
-
 #
 # Denoiser
 #
@@ -1327,9 +1324,6 @@
 
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
   specialize qw/vp9_highbd_block_error sse2/;
-
-  add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-  specialize qw/vp9_highbd_subtract_block/;
 
   add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_fp/;
--- a/vp9/encoder/arm/neon/vp9_subtract_neon.c
+++ /dev/null
@@ -1,81 +1,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vpx/vpx_integer.h"
-
-void vp9_subtract_block_neon(int rows, int cols,
-                             int16_t *diff, ptrdiff_t diff_stride,
-                             const uint8_t *src, ptrdiff_t src_stride,
-                             const uint8_t *pred, ptrdiff_t pred_stride) {
-  int r, c;
-
-  if (cols > 16) {
-    for (r = 0; r < rows; ++r) {
-      for (c = 0; c < cols; c += 32) {
-        const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
-        const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
-        const uint8x16_t v_pred_00 = vld1q_u8(&pred[c +  0]);
-        const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
-        const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00),
-                                                 vget_low_u8(v_pred_00));
-        const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00),
-                                                 vget_high_u8(v_pred_00));
-        const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16),
-                                                 vget_low_u8(v_pred_16));
-        const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16),
-                                                 vget_high_u8(v_pred_16));
-        vst1q_s16(&diff[c +  0], vreinterpretq_s16_u16(v_diff_lo_00));
-        vst1q_s16(&diff[c +  8], vreinterpretq_s16_u16(v_diff_hi_00));
-        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
-        vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
-      }
-      diff += diff_stride;
-      pred += pred_stride;
-      src  += src_stride;
-    }
-  } else if (cols > 8) {
-    for (r = 0; r < rows; ++r) {
-      const uint8x16_t v_src = vld1q_u8(&src[0]);
-      const uint8x16_t v_pred = vld1q_u8(&pred[0]);
-      const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src),
-                                            vget_low_u8(v_pred));
-      const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src),
-                                            vget_high_u8(v_pred));
-      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
-      vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
-      diff += diff_stride;
-      pred += pred_stride;
-      src  += src_stride;
-    }
-  } else if (cols > 4) {
-    for (r = 0; r < rows; ++r) {
-      const uint8x8_t v_src = vld1_u8(&src[0]);
-      const uint8x8_t v_pred = vld1_u8(&pred[0]);
-      const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
-      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
-      diff += diff_stride;
-      pred += pred_stride;
-      src  += src_stride;
-    }
-  } else {
-    for (r = 0; r < rows; ++r) {
-      for (c = 0; c < cols; ++c)
-        diff[c] = src[c] - pred[c];
-
-      diff += diff_stride;
-      pred += pred_stride;
-      src  += src_stride;
-    }
-  }
-}
--- a/vp9/encoder/mips/msa/vp9_subtract_msa.c
+++ /dev/null
@@ -1,264 +1,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
-
-static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
-                            const uint8_t *pred_ptr, int32_t pred_stride,
-                            int16_t *diff_ptr, int32_t diff_stride) {
-  uint32_t src0, src1, src2, src3;
-  uint32_t pred0, pred1, pred2, pred3;
-  v16i8 src = { 0 };
-  v16i8 pred = { 0 };
-  v16u8 src_l0, src_l1;
-  v8i16 diff0, diff1;
-
-  LW4(src_ptr, src_stride, src0, src1, src2, src3);
-  LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
-  INSERT_W4_SB(src0, src1, src2, src3, src);
-  INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
-  ILVRL_B2_UB(src, pred, src_l0, src_l1);
-  HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-  ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
-}
-
-static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                            const uint8_t *pred_ptr, int32_t pred_stride,
-                            int16_t *diff_ptr, int32_t diff_stride) {
-  uint32_t loop_cnt;
-  uint64_t src0, src1, pred0, pred1;
-  v16i8 src = { 0 };
-  v16i8 pred = { 0 };
-  v16u8 src_l0, src_l1;
-  v8i16 diff0, diff1;
-
-  for (loop_cnt = 4; loop_cnt--;) {
-    LD2(src_ptr, src_stride, src0, src1);
-    src_ptr += (2 * src_stride);
-    LD2(pred_ptr, pred_stride, pred0, pred1);
-    pred_ptr += (2 * pred_stride);
-
-    INSERT_D2_SB(src0, src1, src);
-    INSERT_D2_SB(pred0, pred1, pred);
-    ILVRL_B2_UB(src, pred, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff_ptr, diff_stride);
-    diff_ptr += (2 * diff_stride);
-  }
-}
-
-static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *pred, int32_t pred_stride,
-                              int16_t *diff, int32_t diff_stride) {
-  int8_t count;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
-  v16u8 src_l0, src_l1;
-  v8i16 diff0, diff1;
-
-  for (count = 2; count--;) {
-    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-    src += (8 * src_stride);
-
-    LD_SB8(pred, pred_stride,
-           pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7);
-    pred += (8 * pred_stride);
-
-    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-  }
-}
-
-static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *pred, int32_t pred_stride,
-                              int16_t *diff, int32_t diff_stride) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
-  v16u8 src_l0, src_l1;
-  v8i16 diff0, diff1;
-
-  for (loop_cnt = 8; loop_cnt--;) {
-    LD_SB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_SB2(src, 16, src2, src3);
-    src += src_stride;
-    LD_SB2(src, 16, src4, src5);
-    src += src_stride;
-    LD_SB2(src, 16, src6, src7);
-    src += src_stride;
-
-    LD_SB2(pred, 16, pred0, pred1);
-    pred += pred_stride;
-    LD_SB2(pred, 16, pred2, pred3);
-    pred += pred_stride;
-    LD_SB2(pred, 16, pred4, pred5);
-    pred += pred_stride;
-    LD_SB2(pred, 16, pred6, pred7);
-    pred += pred_stride;
-
-    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    diff += diff_stride;
-  }
-}
-
-static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *pred, int32_t pred_stride,
-                              int16_t *diff, int32_t diff_stride) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
-  v16u8 src_l0, src_l1;
-  v8i16 diff0, diff1;
-
-  for (loop_cnt = 32; loop_cnt--;) {
-    LD_SB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_SB4(src, 16, src4, src5, src6, src7);
-    src += src_stride;
-
-    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
-    pred += pred_stride;
-    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
-    pred += pred_stride;
-
-    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 32, 8);
-    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 48, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 32, 8);
-    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 48, 8);
-    diff += diff_stride;
-  }
-}
-
-void vp9_subtract_block_msa(int32_t rows, int32_t cols,
-                            int16_t *diff_ptr, ptrdiff_t diff_stride,
-                            const uint8_t *src_ptr, ptrdiff_t src_stride,
-                            const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  if (rows == cols) {
-    switch (rows) {
-      case 4:
-        sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride,
-                        diff_ptr, diff_stride);
-        break;
-      case 8:
-        sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride,
-                        diff_ptr, diff_stride);
-        break;
-      case 16:
-        sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride,
-                          diff_ptr, diff_stride);
-        break;
-      case 32:
-        sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride,
-                          diff_ptr, diff_stride);
-        break;
-      case 64:
-        sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride,
-                          diff_ptr, diff_stride);
-        break;
-      default:
-        vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
-                             src_stride, pred_ptr, pred_stride);
-        break;
-    }
-  } else {
-    vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
-                         pred_ptr, pred_stride);
-  }
-}
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -11,6 +11,7 @@
 
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -31,45 +32,6 @@
   ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
 };
 
-void vp9_subtract_block_c(int rows, int cols,
-                          int16_t *diff, ptrdiff_t diff_stride,
-                          const uint8_t *src, ptrdiff_t src_stride,
-                          const uint8_t *pred, ptrdiff_t pred_stride) {
-  int r, c;
-
-  for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++)
-      diff[c] = src[c] - pred[c];
-
-    diff += diff_stride;
-    pred += pred_stride;
-    src  += src_stride;
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_subtract_block_c(int rows, int cols,
-                                 int16_t *diff, ptrdiff_t diff_stride,
-                                 const uint8_t *src8, ptrdiff_t src_stride,
-                                 const uint8_t *pred8, ptrdiff_t pred_stride,
-                                 int bd) {
-  int r, c;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  (void) bd;
-
-  for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++) {
-      diff[c] = src[c] - pred[c];
-    }
-
-    diff += diff_stride;
-    pred += pred_stride;
-    src  += src_stride;
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
@@ -79,13 +41,13 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+    vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
                               p->src.stride, pd->dst.buf, pd->dst.stride,
                               x->e_mbd.bd);
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-  vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+  vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                      pd->dst.buf, pd->dst.stride);
 }
 
@@ -838,7 +800,7 @@
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
         if (!x->skip_recode) {
-          vp9_highbd_subtract_block(32, 32, src_diff, diff_stride,
+          vpx_highbd_subtract_block(32, 32, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
           highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
           vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
@@ -859,7 +821,7 @@
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
         if (!x->skip_recode) {
-          vp9_highbd_subtract_block(16, 16, src_diff, diff_stride,
+          vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
           vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
@@ -881,7 +843,7 @@
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
         if (!x->skip_recode) {
-          vp9_highbd_subtract_block(8, 8, src_diff, diff_stride,
+          vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
           vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
@@ -904,7 +866,7 @@
                                 dst, dst_stride, i, j, plane);
 
         if (!x->skip_recode) {
-          vp9_highbd_subtract_block(4, 4, src_diff, diff_stride,
+          vpx_highbd_subtract_block(4, 4, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
           if (tx_type != DCT_DCT)
             vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
@@ -946,7 +908,7 @@
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
-        vp9_subtract_block(32, 32, src_diff, diff_stride,
+        vpx_subtract_block(32, 32, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
         vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
@@ -966,7 +928,7 @@
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
-        vp9_subtract_block(16, 16, src_diff, diff_stride,
+        vpx_subtract_block(16, 16, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
@@ -986,7 +948,7 @@
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
-        vp9_subtract_block(8, 8, src_diff, diff_stride,
+        vpx_subtract_block(8, 8, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
@@ -1007,7 +969,7 @@
                               dst, dst_stride, i, j, plane);
 
       if (!x->skip_recode) {
-        vp9_subtract_block(4, 4, src_diff, diff_stride,
+        vpx_subtract_block(4, 4, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         if (tx_type != DCT_DCT)
           vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -12,6 +12,7 @@
 #include <math.h>
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -832,7 +833,7 @@
                                   x->skip_encode ? src : dst,
                                   x->skip_encode ? src_stride : dst_stride,
                                   dst, dst_stride, idx, idy, 0);
-          vp9_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
+          vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
                                     dst, dst_stride, xd->bd);
           if (xd->lossless) {
             const scan_order *so = &vp9_default_scan_orders[TX_4X4];
@@ -932,7 +933,7 @@
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, idx, idy, 0);
-        vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
+        vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
 
         if (xd->lossless) {
           const scan_order *so = &vp9_default_scan_orders[TX_4X4];
@@ -1394,16 +1395,16 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_highbd_subtract_block(
+    vpx_highbd_subtract_block(
         height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
         8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
   } else {
-    vp9_subtract_block(
+    vpx_subtract_block(
         height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
         8, src, p->src.stride, dst, pd->dst.stride);
   }
 #else
-  vp9_subtract_block(height, width,
+  vpx_subtract_block(height, width,
                      vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
                      8, src, p->src.stride, dst, pd->dst.stride);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
--- a/vp9/encoder/x86/vp9_subtract_sse2.asm
+++ /dev/null
@@ -1,127 +1,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; void vp9_subtract_block(int rows, int cols,
-;                         int16_t *diff, ptrdiff_t diff_stride,
-;                         const uint8_t *src, ptrdiff_t src_stride,
-;                         const uint8_t *pred, ptrdiff_t pred_stride)
-
-INIT_XMM sse2
-cglobal subtract_block, 7, 7, 8, \
-                        rows, cols, diff, diff_stride, src, src_stride, \
-                        pred, pred_stride
-%define pred_str colsq
-  pxor                  m7, m7         ; dedicated zero register
-  cmp                colsd, 4
-  je .case_4
-  cmp                colsd, 8
-  je .case_8
-  cmp                colsd, 16
-  je .case_16
-  cmp                colsd, 32
-  je .case_32
-
-%macro loop16 6
-  mova                  m0, [srcq+%1]
-  mova                  m4, [srcq+%2]
-  mova                  m1, [predq+%3]
-  mova                  m5, [predq+%4]
-  punpckhbw             m2, m0, m7
-  punpckhbw             m3, m1, m7
-  punpcklbw             m0, m7
-  punpcklbw             m1, m7
-  psubw                 m2, m3
-  psubw                 m0, m1
-  punpckhbw             m1, m4, m7
-  punpckhbw             m3, m5, m7
-  punpcklbw             m4, m7
-  punpcklbw             m5, m7
-  psubw                 m1, m3
-  psubw                 m4, m5
-  mova [diffq+mmsize*0+%5], m0
-  mova [diffq+mmsize*1+%5], m2
-  mova [diffq+mmsize*0+%6], m4
-  mova [diffq+mmsize*1+%6], m1
-%endmacro
-
-  mov             pred_str, pred_stridemp
-.loop_64:
-  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
-  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
-  lea                diffq, [diffq+diff_strideq*2]
-  add                predq, pred_str
-  add                 srcq, src_strideq
-  dec                rowsd
-  jg .loop_64
-  RET
-
-.case_32:
-  mov             pred_str, pred_stridemp
-.loop_32:
-  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
-  lea                diffq, [diffq+diff_strideq*2]
-  add                predq, pred_str
-  add                 srcq, src_strideq
-  dec                rowsd
-  jg .loop_32
-  RET
-
-.case_16:
-  mov             pred_str, pred_stridemp
-.loop_16:
-  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                predq, [predq+pred_str*2]
-  lea                 srcq, [srcq+src_strideq*2]
-  sub                rowsd, 2
-  jg .loop_16
-  RET
-
-%macro loop_h 0
-  movh                  m0, [srcq]
-  movh                  m2, [srcq+src_strideq]
-  movh                  m1, [predq]
-  movh                  m3, [predq+pred_str]
-  punpcklbw             m0, m7
-  punpcklbw             m1, m7
-  punpcklbw             m2, m7
-  punpcklbw             m3, m7
-  psubw                 m0, m1
-  psubw                 m2, m3
-  mova             [diffq], m0
-  mova [diffq+diff_strideq*2], m2
-%endmacro
-
-.case_8:
-  mov             pred_str, pred_stridemp
-.loop_8:
-  loop_h
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                 srcq, [srcq+src_strideq*2]
-  lea                predq, [predq+pred_str*2]
-  sub                rowsd, 2
-  jg .loop_8
-  RET
-
-INIT_MMX
-.case_4:
-  mov             pred_str, pred_stridemp
-.loop_4:
-  loop_h
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                 srcq, [srcq+src_strideq*2]
-  lea                predq, [predq+pred_str*2]
-  sub                rowsd, 2
-  jg .loop_4
-  RET
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -114,7 +114,6 @@
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -151,7 +150,6 @@
 endif
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
 
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
@@ -161,7 +159,6 @@
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
--- /dev/null
+++ b/vpx_dsp/arm/subtract_neon.c
@@ -1,0 +1,80 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+void vpx_subtract_block_neon(int rows, int cols,
+                             int16_t *diff, ptrdiff_t diff_stride,
+                             const uint8_t *src, ptrdiff_t src_stride,
+                             const uint8_t *pred, ptrdiff_t pred_stride) {
+  int r, c;
+
+  if (cols > 16) {
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; c += 32) {
+        const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
+        const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
+        const uint8x16_t v_pred_00 = vld1q_u8(&pred[c +  0]);
+        const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
+        const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00),
+                                                 vget_low_u8(v_pred_00));
+        const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00),
+                                                 vget_high_u8(v_pred_00));
+        const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16),
+                                                 vget_low_u8(v_pred_16));
+        const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16),
+                                                 vget_high_u8(v_pred_16));
+        vst1q_s16(&diff[c +  0], vreinterpretq_s16_u16(v_diff_lo_00));
+        vst1q_s16(&diff[c +  8], vreinterpretq_s16_u16(v_diff_hi_00));
+        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
+        vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+      }
+      diff += diff_stride;
+      pred += pred_stride;
+      src  += src_stride;
+    }
+  } else if (cols > 8) {
+    for (r = 0; r < rows; ++r) {
+      const uint8x16_t v_src = vld1q_u8(&src[0]);
+      const uint8x16_t v_pred = vld1q_u8(&pred[0]);
+      const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src),
+                                            vget_low_u8(v_pred));
+      const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src),
+                                            vget_high_u8(v_pred));
+      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
+      vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+      diff += diff_stride;
+      pred += pred_stride;
+      src  += src_stride;
+    }
+  } else if (cols > 4) {
+    for (r = 0; r < rows; ++r) {
+      const uint8x8_t v_src = vld1_u8(&src[0]);
+      const uint8x8_t v_pred = vld1_u8(&pred[0]);
+      const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+      diff += diff_stride;
+      pred += pred_stride;
+      src  += src_stride;
+    }
+  } else {
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c)
+        diff[c] = src[c] - pred[c];
+
+      diff += diff_stride;
+      pred += pred_stride;
+      src  += src_stride;
+    }
+  }
+}
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -24,6 +24,9 @@
 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
 
+#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+
 #if (__mips_isa_rev >= 6)
 #define LW(psrc) ({                                 \
   const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
@@ -38,6 +41,61 @@
                                                     \
   val_m;                                            \
 })
+
+#if (__mips == 64)
+#define LD(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint64_t val_m = 0;                               \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "ld  %[val_m],  %[psrc_m]  \n\t"              \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc) ({                                        \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);         \
+  uint32_t val0_m, val1_m;                                 \
+  uint64_t val_m = 0;                                      \
+                                                           \
+  val0_m = LW(psrc_m);                                     \
+  val1_m = LW(psrc_m + 4);                                 \
+                                                           \
+  val_m = (uint64_t)(val1_m);                              \
+  val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+  val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                           \
+  val_m;                                                   \
+})
+#endif  // (__mips == 64)
+
+#define SW(val, pdst) {                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);  \
+  const uint32_t val_m = (val);         \
+                                        \
+  __asm__ __volatile__ (                \
+      "sw  %[val_m],  %[pdst_m]  \n\t"  \
+                                        \
+      : [pdst_m] "=m" (*pdst_m)         \
+      : [val_m] "r" (val_m)             \
+  );                                    \
+}
+
+#define SD(val, pdst) {                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);  \
+  const uint64_t val_m = (val);         \
+                                        \
+  __asm__ __volatile__ (                \
+      "sd  %[val_m],  %[pdst_m]  \n\t"  \
+                                        \
+      : [pdst_m] "=m" (*pdst_m)         \
+      : [val_m] "r" (val_m)             \
+  );                                    \
+}
 #else  // !(__mips_isa_rev >= 6)
 #define LW(psrc) ({                                 \
   const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
@@ -52,6 +110,60 @@
                                                     \
   val_m;                                            \
 })
+
+#define SW(val, pdst) {                  \
+  uint8_t *pdst_m = (uint8_t *)(pdst);   \
+  const uint32_t val_m = (val);          \
+                                         \
+  __asm__ __volatile__ (                 \
+      "usw  %[val_m],  %[pdst_m]  \n\t"  \
+                                         \
+      : [pdst_m] "=m" (*pdst_m)          \
+      : [val_m] "r" (val_m)              \
+  );                                     \
+}
+
+#if (__mips == 64)
+#define LD(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint64_t val_m = 0;                               \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "uld  %[val_m],  %[psrc_m]  \n\t"             \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc) ({                                        \
+  const uint8_t *psrc_m1 = (const uint8_t *)(psrc);        \
+  uint32_t val0_m, val1_m;                                 \
+  uint64_t val_m = 0;                                      \
+                                                           \
+  val0_m = LW(psrc_m1);                                    \
+  val1_m = LW(psrc_m1 + 4);                                \
+                                                           \
+  val_m = (uint64_t)(val1_m);                              \
+  val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+  val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                           \
+  val_m;                                                   \
+})
+#endif  // (__mips == 64)
+
+#define SD(val, pdst) {                                     \
+  uint8_t *pdst_m1 = (uint8_t *)(pdst);                     \
+  uint32_t val0_m, val1_m;                                  \
+                                                            \
+  val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);          \
+  val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);  \
+                                                            \
+  SW(val0_m, pdst_m1);                                      \
+  SW(val1_m, pdst_m1 + 4);                                  \
+}
 #endif  // (__mips_isa_rev >= 6)
 
 /* Description : Load 4 words with stride
@@ -69,6 +181,21 @@
   out3 = LW((psrc) + 3 * stride);                    \
 }
 
+/* Description : Load double words with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+   Details     : Load double word in 'out0' from (psrc)
+                 Load double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1) {  \
+  out0 = LD((psrc));                     \
+  out1 = LD((psrc) + stride);            \
+}
+#define LD4(psrc, stride, out0, out1, out2, out3) {  \
+  LD2((psrc), stride, out0, out1);                   \
+  LD2((psrc) + 2 * stride, stride, out2, out3);      \
+}
+
 /* Description : Load vectors with 16 byte elements with stride
    Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
@@ -81,6 +208,7 @@
   out1 = LD_B(RTYPE, (psrc) + stride);            \
 }
 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
 
 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) {  \
   LD_B2(RTYPE, (psrc), stride, out0, out1);             \
@@ -93,6 +221,7 @@
   LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);     \
 }
 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
 
 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) {  \
   LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);             \
@@ -100,6 +229,14 @@
 }
 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
 
+#define LD_B8(RTYPE, psrc, stride,                                    \
+              out0, out1, out2, out3, out4, out5, out6, out7) {       \
+  LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
+  LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
+}
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
+
 /* Description : Load vectors with 8 halfword elements with stride
    Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
@@ -271,6 +408,13 @@
 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
 
+#define INSERT_D2(RTYPE, in0, in1, out) {           \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
+}
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
 /* Description : Interleave both left and right half of input vectors
    Arguments   : Inputs  - in0, in1
                  Outputs - out0, out1
@@ -327,5 +471,54 @@
                                          \
   tmp_m = __msa_clti_s_h((v8i16)in, 0);  \
   ILVRL_H2_SW(tmp_m, in, out0, out1);    \
+}
+
+/* Description : Store 4 double words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store double word from 'in0' to (pdst)
+                 Store double word from 'in1' to (pdst + stride)
+                 Store double word from 'in2' to (pdst + 2 * stride)
+                 Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride) {  \
+  SD(in0, (pdst));                               \
+  SD(in1, (pdst) + stride);                      \
+  SD(in2, (pdst) + 2 * stride);                  \
+  SD(in3, (pdst) + 3 * stride);                  \
+}
+
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) {  \
+  ST_H(RTYPE, in0, (pdst));                     \
+  ST_H(RTYPE, in1, (pdst) + stride);            \
+}
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+/* Description : Store 8x4 byte block to destination memory from input
+                 vectors
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Index 0 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst + stride)
+                 Index 0 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 Index 1 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride) {                  \
+  uint64_t out0_m, out1_m, out2_m, out3_m;                  \
+  uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                  \
+                                                            \
+  out0_m = __msa_copy_u_d((v2i64)in0, 0);                   \
+  out1_m = __msa_copy_u_d((v2i64)in0, 1);                   \
+  out2_m = __msa_copy_u_d((v2i64)in1, 0);                   \
+  out3_m = __msa_copy_u_d((v2i64)in1, 1);                   \
+                                                            \
+  SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
 }
 #endif  /* VPX_DSP_MIPS_MACROS_MSA_H_ */
--- /dev/null
+++ b/vpx_dsp/mips/subtract_msa.c
@@ -1,0 +1,264 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
+
+static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  uint32_t src0, src1, src2, src3;
+  uint32_t pred0, pred1, pred2, pred3;
+  v16i8 src = { 0 };
+  v16i8 pred = { 0 };
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+
+  LW4(src_ptr, src_stride, src0, src1, src2, src3);
+  LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
+  INSERT_W4_SB(src0, src1, src2, src3, src);
+  INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
+  ILVRL_B2_UB(src, pred, src_l0, src_l1);
+  HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+  ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
+}
+
+static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  uint32_t loop_cnt;
+  uint64_t src0, src1, pred0, pred1;
+  v16i8 src = { 0 };
+  v16i8 pred = { 0 };
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+
+  for (loop_cnt = 4; loop_cnt--;) {
+    LD2(src_ptr, src_stride, src0, src1);
+    src_ptr += (2 * src_stride);
+    LD2(pred_ptr, pred_stride, pred0, pred1);
+    pred_ptr += (2 * pred_stride);
+
+    INSERT_D2_SB(src0, src1, src);
+    INSERT_D2_SB(pred0, pred1, pred);
+    ILVRL_B2_UB(src, pred, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff_ptr, diff_stride);
+    diff_ptr += (2 * diff_stride);
+  }
+}
+
+static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  int8_t count;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+
+  for (count = 2; count--;) {
+    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+    src += (8 * src_stride);
+
+    LD_SB8(pred, pred_stride,
+           pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7);
+    pred += (8 * pred_stride);
+
+    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    diff += diff_stride;
+  }
+}
+
+static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+
+  for (loop_cnt = 8; loop_cnt--;) {
+    LD_SB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_SB2(src, 16, src2, src3);
+    src += src_stride;
+    LD_SB2(src, 16, src4, src5);
+    src += src_stride;
+    LD_SB2(src, 16, src6, src7);
+    src += src_stride;
+
+    LD_SB2(pred, 16, pred0, pred1);
+    pred += pred_stride;
+    LD_SB2(pred, 16, pred2, pred3);
+    pred += pred_stride;
+    LD_SB2(pred, 16, pred4, pred5);
+    pred += pred_stride;
+    LD_SB2(pred, 16, pred6, pred7);
+    pred += pred_stride;
+
+    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    diff += diff_stride;
+  }
+}
+
+static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  v16u8 src_l0, src_l1;
+  v8i16 diff0, diff1;
+
+  for (loop_cnt = 32; loop_cnt--;) {
+    LD_SB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_SB4(src, 16, src4, src5, src6, src7);
+    src += src_stride;
+
+    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
+    pred += pred_stride;
+    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
+    pred += pred_stride;
+
+    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 32, 8);
+    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 48, 8);
+    diff += diff_stride;
+
+    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff, 8);
+    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 16, 8);
+    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 32, 8);
+    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
+    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
+    ST_SH2(diff0, diff1, diff + 48, 8);
+    diff += diff_stride;
+  }
+}
+
+void vpx_subtract_block_msa(int32_t rows, int32_t cols,
+                            int16_t *diff_ptr, ptrdiff_t diff_stride,
+                            const uint8_t *src_ptr, ptrdiff_t src_stride,
+                            const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+  if (rows == cols) {
+    switch (rows) {
+      case 4:
+        sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                        diff_ptr, diff_stride);
+        break;
+      case 8:
+        sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                        diff_ptr, diff_stride);
+        break;
+      case 16:
+        sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                          diff_ptr, diff_stride);
+        break;
+      case 32:
+        sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                          diff_ptr, diff_stride);
+        break;
+      case 64:
+        sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride,
+                          diff_ptr, diff_stride);
+        break;
+      default:
+        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+                             src_stride, pred_ptr, pred_stride);
+        break;
+    }
+  } else {
+    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+                         pred_ptr, pred_stride);
+  }
+}
--- /dev/null
+++ b/vpx_dsp/subtract.c
@@ -1,0 +1,56 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+void vpx_subtract_block_c(int rows, int cols,
+                          int16_t *diff, ptrdiff_t diff_stride,
+                          const uint8_t *src, ptrdiff_t src_stride,
+                          const uint8_t *pred, ptrdiff_t pred_stride) {
+  int r, c;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++)
+      diff[c] = src[c] - pred[c];
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src  += src_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_c(int rows, int cols,
+                                 int16_t *diff, ptrdiff_t diff_stride,
+                                 const uint8_t *src8, ptrdiff_t src_stride,
+                                 const uint8_t *pred8, ptrdiff_t pred_stride,
+                                 int bd) {
+  int r, c;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  (void) bd;
+
+  for (r = 0; r < rows; r++) {
+    for (c = 0; c < cols; c++) {
+      diff[c] = src[c] - pred[c];
+    }
+
+    diff += diff_stride;
+    pred += pred_stride;
+    src  += src_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -12,13 +12,16 @@
 
 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes            += sad.c
+DSP_SRCS-yes            += subtract.c
 
 DSP_SRCS-$(HAVE_MEDIA)  += arm/sad_media$(ASM)
 DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c
 
 DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
 DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
 
 DSP_SRCS-$(HAVE_MMX)    += x86/sad_mmx.asm
 DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
@@ -30,6 +33,7 @@
 ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -37,6 +37,12 @@
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 #
+# Block subtraction
+#
+add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
+
+#
 # Single block SAD
 #
 add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
@@ -210,6 +216,12 @@
 specialize qw/vpx_sad4x4x4d msa/, "$sse_x86inc";
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Block subtraction
+  #
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vpx_highbd_subtract_block/;
+
   #
   # Single block SAD
   #
--- /dev/null
+++ b/vpx_dsp/x86/subtract_sse2.asm
@@ -1,0 +1,128 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+%define program_name vpx
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void vpx_subtract_block(int rows, int cols,
+;                         int16_t *diff, ptrdiff_t diff_stride,
+;                         const uint8_t *src, ptrdiff_t src_stride,
+;                         const uint8_t *pred, ptrdiff_t pred_stride)
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+                        rows, cols, diff, diff_stride, src, src_stride, \
+                        pred, pred_stride
+%define pred_str colsq
+  pxor                  m7, m7         ; dedicated zero register
+  cmp                colsd, 4
+  je .case_4
+  cmp                colsd, 8
+  je .case_8
+  cmp                colsd, 16
+  je .case_16
+  cmp                colsd, 32
+  je .case_32
+
+%macro loop16 6
+  mova                  m0, [srcq+%1]
+  mova                  m4, [srcq+%2]
+  mova                  m1, [predq+%3]
+  mova                  m5, [predq+%4]
+  punpckhbw             m2, m0, m7
+  punpckhbw             m3, m1, m7
+  punpcklbw             m0, m7
+  punpcklbw             m1, m7
+  psubw                 m2, m3
+  psubw                 m0, m1
+  punpckhbw             m1, m4, m7
+  punpckhbw             m3, m5, m7
+  punpcklbw             m4, m7
+  punpcklbw             m5, m7
+  psubw                 m1, m3
+  psubw                 m4, m5
+  mova [diffq+mmsize*0+%5], m0
+  mova [diffq+mmsize*1+%5], m2
+  mova [diffq+mmsize*0+%6], m4
+  mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+  mov             pred_str, pred_stridemp
+.loop_64:
+  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+  lea                diffq, [diffq+diff_strideq*2]
+  add                predq, pred_str
+  add                 srcq, src_strideq
+  dec                rowsd
+  jg .loop_64
+  RET
+
+.case_32:
+  mov             pred_str, pred_stridemp
+.loop_32:
+  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+  lea                diffq, [diffq+diff_strideq*2]
+  add                predq, pred_str
+  add                 srcq, src_strideq
+  dec                rowsd
+  jg .loop_32
+  RET
+
+.case_16:
+  mov             pred_str, pred_stridemp
+.loop_16:
+  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                predq, [predq+pred_str*2]
+  lea                 srcq, [srcq+src_strideq*2]
+  sub                rowsd, 2
+  jg .loop_16
+  RET
+
+%macro loop_h 0
+  movh                  m0, [srcq]
+  movh                  m2, [srcq+src_strideq]
+  movh                  m1, [predq]
+  movh                  m3, [predq+pred_str]
+  punpcklbw             m0, m7
+  punpcklbw             m1, m7
+  punpcklbw             m2, m7
+  punpcklbw             m3, m7
+  psubw                 m0, m1
+  psubw                 m2, m3
+  mova             [diffq], m0
+  mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+  mov             pred_str, pred_stridemp
+.loop_8:
+  loop_h
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                 srcq, [srcq+src_strideq*2]
+  lea                predq, [predq+pred_str*2]
+  sub                rowsd, 2
+  jg .loop_8
+  RET
+
+INIT_MMX
+.case_4:
+  mov             pred_str, pred_stridemp
+.loop_4:
+  loop_h
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                 srcq, [srcq+src_strideq*2]
+  lea                predq, [predq+pred_str*2]
+  sub                rowsd, 2
+  jg .loop_4
+  RET